1 //===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL NextGen for AMDGPU machine
11 //===----------------------------------------------------------------------===//
13 #include <atomic>
14 #include <cassert>
15 #include <cstddef>
16 #include <deque>
17 #include <mutex>
18 #include <string>
19 #include <system_error>
20 #include <unistd.h>
21 #include <unordered_map>
23 #include "Debug.h"
24 #include "Environment.h"
25 #include "GlobalHandler.h"
26 #include "OmptCallback.h"
27 #include "PluginInterface.h"
28 #include "Utilities.h"
29 #include "UtilitiesRTL.h"
30 #include "omptarget.h"
32 #include "llvm/ADT/SmallString.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/StringRef.h"
35 #include "llvm/BinaryFormat/ELF.h"
36 #include "llvm/Frontend/OpenMP/OMPConstants.h"
37 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
38 #include "llvm/Support/Error.h"
39 #include "llvm/Support/FileSystem.h"
40 #include "llvm/Support/MemoryBuffer.h"
41 #include "llvm/Support/Program.h"
42 #include "llvm/Support/raw_ostream.h"
44 #if defined(__has_include)
45 #if __has_include("hsa/hsa.h")
46 #include "hsa/hsa.h"
47 #include "hsa/hsa_ext_amd.h"
48 #elif __has_include("hsa.h")
49 #include "hsa.h"
50 #include "hsa_ext_amd.h"
51 #endif
52 #else
53 #include "hsa/hsa.h"
54 #include "hsa/hsa_ext_amd.h"
55 #endif
57 namespace llvm {
58 namespace omp {
59 namespace target {
60 namespace plugin {
62 /// Forward declarations for all specialized data structures.
63 struct AMDGPUKernelTy;
64 struct AMDGPUDeviceTy;
65 struct AMDGPUPluginTy;
66 struct AMDGPUStreamTy;
67 struct AMDGPUEventTy;
68 struct AMDGPUStreamManagerTy;
69 struct AMDGPUEventManagerTy;
70 struct AMDGPUDeviceImageTy;
71 struct AMDGPUMemoryManagerTy;
72 struct AMDGPUMemoryPoolTy;
74 namespace utils {
76 /// Iterate elements using an HSA iterate function. Do not use this function
77 /// directly; use the specialized ones below instead.
78 template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
79 hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
80 auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
81 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
82 return (*Unwrapped)(Elem);
84 return Func(L, static_cast<void *>(&Cb));
87 /// Iterate elements using an HSA iterate function passing a parameter. Do not
88 /// use this function directly; use the specialized ones below instead.
89 template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
90 typename CallbackTy>
91 hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
92 auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
93 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
94 return (*Unwrapped)(Elem);
96 return Func(FuncArg, L, static_cast<void *>(&Cb));
99 /// Iterate elements using an HSA iterate function passing a parameter. Do not
100 /// use this function directly; use the specialized ones below instead.
101 template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
102 typename IterFuncArgTy, typename CallbackTy>
103 hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
104 auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
105 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
106 return (*Unwrapped)(Elem1, Elem2);
108 return Func(FuncArg, L, static_cast<void *>(&Cb));
111 /// Iterate agents.
112 template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
113 hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
114 return Plugin::check(Status, "Error in hsa_iterate_agents: %s");
117 /// Iterate ISAs of an agent.
118 template <typename CallbackTy>
119 Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
120 hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
121 return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s");
124 /// Iterate memory pools of an agent.
125 template <typename CallbackTy>
126 Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
127 hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
128 hsa_amd_agent_iterate_memory_pools, Agent, Cb);
129 return Plugin::check(Status,
130 "Error in hsa_amd_agent_iterate_memory_pools: %s");
133 } // namespace utils
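// Note: the iterate() helpers above adapt C++ callables to the C-style HSA
// iteration APIs. An illustrative sketch (hypothetical usage, not code from
// this file) of collecting all GPU agents through the wrapper:
//
//   llvm::SmallVector<hsa_agent_t> GPUAgents;
//   auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
//     hsa_device_type_t DeviceType;
//     if (hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType) !=
//         HSA_STATUS_SUCCESS)
//       return HSA_STATUS_ERROR;
//     if (DeviceType == HSA_DEVICE_TYPE_GPU)
//       GPUAgents.push_back(Agent);
//     return HSA_STATUS_SUCCESS;
//   });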
135 /// Utility class representing generic resource references to AMDGPU resources.
136 template <typename ResourceTy>
137 struct AMDGPUResourceRef : public GenericDeviceResourceRef {
138 /// The underlying handle type for resources.
139 using HandleTy = ResourceTy *;
141 /// Create an empty reference to an invalid resource.
142 AMDGPUResourceRef() : Resource(nullptr) {}
144 /// Create a reference to an existing resource.
145 AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}
147 virtual ~AMDGPUResourceRef() {}
149 /// Create a new resource and save the reference. The reference must be empty
150 /// before calling this function.
151 Error create(GenericDeviceTy &Device) override;
153 /// Destroy the referenced resource and invalidate the reference. The
154 /// reference must be to a valid resource before calling this function.
155 Error destroy(GenericDeviceTy &Device) override {
156 if (!Resource)
157 return Plugin::error("Destroying an invalid resource");
159 if (auto Err = Resource->deinit())
160 return Err;
162 delete Resource;
164 Resource = nullptr;
165 return Plugin::success();
168 /// Get the underlying resource handle.
169 operator HandleTy() const { return Resource; }
171 private:
172 /// The handle to the actual resource.
173 HandleTy Resource;
176 /// Class holding an HSA memory pool.
177 struct AMDGPUMemoryPoolTy {
178 /// Create a memory pool from an HSA memory pool.
179 AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
180 : MemoryPool(MemoryPool), GlobalFlags(0) {}
182 /// Initialize the memory pool retrieving its properties.
183 Error init() {
184 if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
185 return Err;
187 if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
188 return Err;
190 return Plugin::success();
193 /// Getter of the HSA memory pool.
194 hsa_amd_memory_pool_t get() const { return MemoryPool; }
196 /// Indicate the segment to which the memory pool belongs.
197 bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
198 bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
199 bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
200 bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }
202 /// Indicate if it is fine-grained memory. Valid only for global.
203 bool isFineGrained() const {
204 assert(isGlobal() && "Not global memory");
205 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
208 /// Indicate if it is coarse-grained memory. Valid only for global.
209 bool isCoarseGrained() const {
210 assert(isGlobal() && "Not global memory");
211 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
214 /// Indicate if it supports storing kernel arguments. Valid only for global.
215 bool supportsKernelArgs() const {
216 assert(isGlobal() && "Not global memory");
217 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
220 /// Allocate memory on the memory pool.
221 Error allocate(size_t Size, void **PtrStorage) {
222 hsa_status_t Status =
223 hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
224 return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s");
227 /// Return memory to the memory pool.
228 Error deallocate(void *Ptr) {
229 hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
230 return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s");
233 /// Allow the device to access a specific allocation.
234 Error enableAccess(void *Ptr, int64_t Size,
235 const llvm::SmallVector<hsa_agent_t> &Agents) const {
236 #ifdef OMPTARGET_DEBUG
237 for (hsa_agent_t Agent : Agents) {
238 hsa_amd_memory_pool_access_t Access;
239 if (auto Err =
240 getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
241 return Err;
243 // The agent is not allowed to access the memory pool in any case. Do not
244 // continue because otherwise it results in undefined behavior.
245 if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
246 return Plugin::error("An agent is not allowed to access a memory pool");
248 #endif
250 // Access is allowed but disabled by default, so enable it for the given agents.
251 hsa_status_t Status =
252 hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
253 return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
256 /// Get attribute from the memory pool.
257 template <typename Ty>
258 Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
259 hsa_status_t Status;
260 Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
261 return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
264 template <typename Ty>
265 hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
266 return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
269 /// Get attribute from the memory pool relating to an agent.
270 template <typename Ty>
271 Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
272 Ty &Value) const {
273 hsa_status_t Status;
274 Status =
275 hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
276 return Plugin::check(Status,
277 "Error in hsa_amd_agent_memory_pool_get_info: %s");
280 private:
281 /// The HSA memory pool.
282 hsa_amd_memory_pool_t MemoryPool;
285 /// The segment to which the memory pool belongs.
285 hsa_amd_segment_t Segment;
287 /// The global flags of the memory pool. Only valid if the memory pool belongs to
288 /// the global segment.
289 uint32_t GlobalFlags;
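// A minimal usage sketch of the memory pool wrapper above (illustrative only;
// the real call sites live in the device and memory manager code further
// below):
//
//   AMDGPUMemoryPoolTy Pool(HsaPool); // HsaPool is an hsa_amd_memory_pool_t.
//   if (auto Err = Pool.init())
//     return Err;
//   void *Ptr = nullptr;
//   if (auto Err = Pool.allocate(Size, &Ptr))
//     return Err;
//   if (auto Err = Pool.enableAccess(Ptr, Size, Agents))
//     return Err;
//   if (auto Err = Pool.deallocate(Ptr))
//     return Err;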
292 /// Class that implements a memory manager that gets memory from a specific
293 /// memory pool.
294 struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
296 /// Create an empty memory manager.
297 AMDGPUMemoryManagerTy() : MemoryPool(nullptr), MemoryManager(nullptr) {}
299 /// Initialize the memory manager from a memory pool.
300 Error init(AMDGPUMemoryPoolTy &MemoryPool) {
301 const uint32_t Threshold = 1 << 30;
302 this->MemoryManager = new MemoryManagerTy(*this, Threshold);
303 this->MemoryPool = &MemoryPool;
304 return Plugin::success();
307 /// Deinitialize the memory manager and free its allocations.
308 Error deinit() {
309 assert(MemoryManager && "Invalid memory manager");
311 // Delete and invalidate the memory manager. At this point, the memory
312 // manager will deallocate all its allocations.
313 delete MemoryManager;
314 MemoryManager = nullptr;
316 return Plugin::success();
319 /// Reuse or allocate memory through the memory manager.
320 Error allocate(size_t Size, void **PtrStorage) {
321 assert(MemoryManager && "Invalid memory manager");
322 assert(PtrStorage && "Invalid pointer storage");
324 *PtrStorage = MemoryManager->allocate(Size, nullptr);
325 if (*PtrStorage == nullptr)
326 return Plugin::error("Failure to allocate from AMDGPU memory manager");
328 return Plugin::success();
331 /// Release an allocation to be reused.
332 Error deallocate(void *Ptr) {
333 assert(Ptr && "Invalid pointer");
335 if (MemoryManager->free(Ptr))
336 return Plugin::error("Failure to deallocate from AMDGPU memory manager");
338 return Plugin::success();
341 private:
342 /// Allocation callback that will be called once the memory manager does not
343 /// have any more previously allocated buffers to reuse.
344 void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
346 /// Deallocation callback that will be called by the memory manager.
347 int free(void *TgtPtr, TargetAllocTy Kind) override {
348 if (auto Err = MemoryPool->deallocate(TgtPtr)) {
349 consumeError(std::move(Err));
350 return OFFLOAD_FAIL;
352 return OFFLOAD_SUCCESS;
355 /// The memory pool used to allocate memory.
356 AMDGPUMemoryPoolTy *MemoryPool;
358 /// Reference to the actual memory manager.
359 MemoryManagerTy *MemoryManager;
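// Note on the 1 << 30 threshold used in init() above: MemoryManagerTy caches
// freed buffers for reuse, and the threshold bounds the allocation sizes it
// manages (an assumption based on the generic MemoryManagerTy behavior;
// larger requests are expected to go straight to the underlying pool).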
362 /// Class implementing the AMDGPU device images' properties.
363 struct AMDGPUDeviceImageTy : public DeviceImageTy {
364 /// Create the AMDGPU image with the id and the target image pointer.
365 AMDGPUDeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
366 : DeviceImageTy(ImageId, TgtImage) {}
368 /// Prepare and load the executable corresponding to the image.
369 Error loadExecutable(const AMDGPUDeviceTy &Device);
371 /// Unload the executable.
372 Error unloadExecutable() {
373 hsa_status_t Status = hsa_executable_destroy(Executable);
374 if (auto Err = Plugin::check(Status, "Error in hsa_executable_destroy: %s"))
375 return Err;
377 Status = hsa_code_object_destroy(CodeObject);
378 return Plugin::check(Status, "Error in hsa_code_object_destroy: %s");
381 /// Get the executable.
382 hsa_executable_t getExecutable() const { return Executable; }
384 /// Get the code object version of the ELF.
385 uint16_t getELFABIVersion() const { return ELFABIVersion; }
387 /// Find an HSA device symbol by its name on the executable.
388 Expected<hsa_executable_symbol_t>
389 findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
391 /// Get additional info for kernel, e.g., register spill counts
392 std::optional<utils::KernelMetaDataTy>
393 getKernelInfo(StringRef Identifier) const {
394 auto It = KernelInfoMap.find(Identifier);
396 if (It == KernelInfoMap.end())
397 return {};
399 return It->second;
402 private:
403 /// The executable loaded on the agent.
404 hsa_executable_t Executable;
405 hsa_code_object_t CodeObject;
406 StringMap<utils::KernelMetaDataTy> KernelInfoMap;
407 uint16_t ELFABIVersion;
410 /// Class implementing the AMDGPU kernel functionalities which derives from the
411 /// generic kernel class.
412 struct AMDGPUKernelTy : public GenericKernelTy {
413 /// Create an AMDGPU kernel with a name and an execution mode.
414 AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}
416 /// Initialize the AMDGPU kernel.
417 Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
418 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
420 // Kernel symbols have a ".kd" suffix.
421 std::string KernelName(getName());
422 KernelName += ".kd";
424 // Find the symbol on the device executable.
425 auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
426 if (!SymbolOrErr)
427 return SymbolOrErr.takeError();
429 hsa_executable_symbol_t Symbol = *SymbolOrErr;
430 hsa_symbol_kind_t SymbolType;
431 hsa_status_t Status;
433 // Retrieve different properties of the kernel symbol.
434 std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
435 {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
436 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
437 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
438 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
439 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
441 for (auto &Info : RequiredInfos) {
442 Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
443 if (auto Err = Plugin::check(
444 Status, "Error in hsa_executable_symbol_get_info: %s"))
445 return Err;
448 // Make sure it is a kernel symbol.
449 if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
450 return Plugin::error("Symbol %s is not a kernel function");
452 // TODO: Read the kernel descriptor for the max threads per block. May be
453 // read from the image.
455 ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
456 DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
458 // Get additional kernel info read from image
459 KernelInfo = AMDImage.getKernelInfo(getName());
460 if (!KernelInfo.has_value())
461 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
462 "Could not read extra information for kernel %s.", getName());
464 return Plugin::success();
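// Note: the ".kd" suffix used above names the kernel descriptor symbol that
// the AMDGPU backend emits for every kernel. The KERNEL_OBJECT info queried
// from that symbol is the device address of the descriptor, which is what the
// dispatch packet's kernel_object field expects (see
// AMDGPUQueueTy::pushKernelLaunch below).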
467 /// Launch the AMDGPU kernel function.
468 Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
469 uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
470 AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
472 /// Print more elaborate kernel launch info for AMDGPU
473 Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
474 KernelArgsTy &KernelArgs, uint32_t NumThreads,
475 uint64_t NumBlocks) const override;
477 /// Get group and private segment kernel size.
478 uint32_t getGroupSize() const { return GroupSize; }
479 uint32_t getPrivateSize() const { return PrivateSize; }
481 /// Get the HSA kernel object representing the kernel function.
482 uint64_t getKernelObject() const { return KernelObject; }
484 /// Get the size of the implicit arguments based on the code object version.
485 /// @return 56 for cov4 and 256 for cov5
486 uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
488 private:
489 /// The kernel object to execute.
490 uint64_t KernelObject;
492 /// The args, group and private segments sizes required by a kernel instance.
493 uint32_t ArgsSize;
494 uint32_t GroupSize;
495 uint32_t PrivateSize;
497 /// The size of implicit kernel arguments.
498 uint32_t ImplicitArgsSize;
500 /// Additional Info for the AMD GPU Kernel
501 std::optional<utils::KernelMetaDataTy> KernelInfo;
504 /// Class representing an HSA signal. Signals are used to define dependencies
505 /// between asynchronous operations: kernel launches and memory transfers.
506 struct AMDGPUSignalTy {
507 /// Create an empty signal.
508 AMDGPUSignalTy() : HSASignal({0}), UseCount() {}
509 AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {}
511 /// Initialize the signal with an initial value.
512 Error init(uint32_t InitialValue = 1) {
513 hsa_status_t Status =
514 hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal);
515 return Plugin::check(Status, "Error in hsa_signal_create: %s");
518 /// Deinitialize the signal.
519 Error deinit() {
520 hsa_status_t Status = hsa_signal_destroy(HSASignal);
521 return Plugin::check(Status, "Error in hsa_signal_destroy: %s");
524 /// Wait until the signal gets a zero value.
525 Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr,
526 GenericDeviceTy *Device = nullptr) const {
527 if (ActiveTimeout && !RPCServer) {
528 hsa_signal_value_t Got = 1;
529 Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
530 ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
531 if (Got == 0)
532 return Plugin::success();
535 // If there is an RPC device attached to this stream we run it as a server.
536 uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
537 auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
538 while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
539 Timeout, WaitState) != 0) {
540 if (RPCServer && Device)
541 if (auto Err = RPCServer->runServer(*Device))
542 return Err;
544 return Plugin::success();
547 /// Load the value on the signal.
548 hsa_signal_value_t load() const {
549 return hsa_signal_load_scacquire(HSASignal);
552 /// Decrement the signal by one.
553 void signal() {
554 assert(load() > 0 && "Invalid signal value");
555 hsa_signal_subtract_screlease(HSASignal, 1);
558 /// Reset the signal value before reusing the signal. Do not call this
559 /// function if the signal is being currently used by any watcher, such as a
560 /// plugin thread or the HSA runtime.
561 void reset() { hsa_signal_store_screlease(HSASignal, 1); }
563 /// Increase the number of concurrent uses.
564 void increaseUseCount() { UseCount.increase(); }
566 /// Decrease the number of concurrent uses and return whether it was the last one.
567 bool decreaseUseCount() { return UseCount.decrease(); }
569 hsa_signal_t get() const { return HSASignal; }
571 private:
572 /// The underlying HSA signal.
573 hsa_signal_t HSASignal;
575 /// Reference counter for tracking the concurrent use count. This is mainly
576 /// used for knowing how many streams are using the signal.
577 RefCountTy<> UseCount;
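// Typical life cycle of a signal with this wrapper: init() creates the HSA
// signal with value 1, the signal is handed to an HSA operation as its
// completion signal, wait() blocks until the runtime decrements it to 0, and
// reset() restores the value to 1 before the signal is reused. The use count
// only tracks how many streams still reference the signal; it does not affect
// the HSA signal value itself.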
580 /// Classes for holding AMDGPU signals and managing signals.
581 using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
582 using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
584 /// Class holding an HSA queue to submit kernel and barrier packets.
585 struct AMDGPUQueueTy {
586 /// Create an empty queue.
587 AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
589 /// Lazily initialize a new queue belonging to a specific agent.
590 Error init(hsa_agent_t Agent, int32_t QueueSize) {
591 if (Queue)
592 return Plugin::success();
593 hsa_status_t Status =
594 hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
595 nullptr, UINT32_MAX, UINT32_MAX, &Queue);
596 return Plugin::check(Status, "Error in hsa_queue_create: %s");
599 /// Deinitialize the queue and destroy its resources.
600 Error deinit() {
601 std::lock_guard<std::mutex> Lock(Mutex);
602 if (!Queue)
603 return Plugin::success();
604 hsa_status_t Status = hsa_queue_destroy(Queue);
605 return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
608 /// Returns the number of streams this queue is currently assigned to.
609 uint32_t getUserCount() const { return NumUsers; }
611 /// Returns whether the underlying HSA queue is initialized.
612 bool isInitialized() { return Queue != nullptr; }
614 /// Decrement user count of the queue object.
615 void removeUser() { --NumUsers; }
617 /// Increase user count of the queue object.
618 void addUser() { ++NumUsers; }
620 /// Push a kernel launch to the queue. The kernel launch requires an output
621 /// signal and can define an optional input signal (nullptr if none).
622 Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
623 uint32_t NumThreads, uint64_t NumBlocks,
624 uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
625 AMDGPUSignalTy *InputSignal) {
626 assert(OutputSignal && "Invalid kernel output signal");
628 // Lock the queue during the packet publishing process. Notice this blocks
629 // the addition of other packets to the queue. The following piece of code
630 // should be lightweight; do not block the thread, allocate memory, etc.
631 std::lock_guard<std::mutex> Lock(Mutex);
632 assert(Queue && "Interacted with a non-initialized queue!");
634 // Avoid defining the input dependency if already satisfied.
635 if (InputSignal && !InputSignal->load())
636 InputSignal = nullptr;
638 // Add a barrier packet before the kernel packet in case there is a pending
639 // preceding operation. The barrier packet will delay the processing of
640 // subsequent packets in the queue until the barrier's input signals are
641 // satisfied. No output signal is needed because the dependency is already
642 // guaranteed by the queue barrier itself.
643 if (InputSignal)
644 if (auto Err = pushBarrierImpl(nullptr, InputSignal))
645 return Err;
647 // Now prepare the kernel packet.
648 uint64_t PacketId;
649 hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
650 assert(Packet && "Invalid packet");
652 // The first 32 bits of the packet are written after the other fields
653 uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
654 Packet->workgroup_size_x = NumThreads;
655 Packet->workgroup_size_y = 1;
656 Packet->workgroup_size_z = 1;
657 Packet->reserved0 = 0;
658 Packet->grid_size_x = NumBlocks * NumThreads;
659 Packet->grid_size_y = 1;
660 Packet->grid_size_z = 1;
661 Packet->private_segment_size = Kernel.getPrivateSize();
662 Packet->group_segment_size = GroupSize;
663 Packet->kernel_object = Kernel.getKernelObject();
664 Packet->kernarg_address = KernelArgs;
665 Packet->reserved2 = 0;
666 Packet->completion_signal = OutputSignal->get();
668 // Publish the packet. Do not modify the packet after this point.
669 publishKernelPacket(PacketId, Setup, Packet);
671 return Plugin::success();
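// Note on the dispatch sizes above: HSA grid sizes are expressed in
// work-items rather than work-groups, so grid_size_x is NumBlocks *
// NumThreads while workgroup_size_x is NumThreads. Only one dimension is
// used, which is why the setup word encodes a single dimension.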
674 /// Push a barrier packet that will wait on up to two input signals. All signals
675 /// are optional (nullptr if none).
676 Error pushBarrier(AMDGPUSignalTy *OutputSignal,
677 const AMDGPUSignalTy *InputSignal1,
678 const AMDGPUSignalTy *InputSignal2) {
679 // Lock the queue during the packet publishing process.
680 std::lock_guard<std::mutex> Lock(Mutex);
681 assert(Queue && "Interacted with a non-initialized queue!");
683 // Push the barrier with the lock acquired.
684 return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
687 private:
688 /// Push a barrier packet that will wait on up to two input signals. Assumes
689 /// the queue lock is acquired.
690 Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
691 const AMDGPUSignalTy *InputSignal1,
692 const AMDGPUSignalTy *InputSignal2 = nullptr) {
693 // Add a queue barrier waiting on both the other stream's operation and the
694 // last operation on the current stream (if any).
695 uint64_t PacketId;
696 hsa_barrier_and_packet_t *Packet =
697 (hsa_barrier_and_packet_t *)acquirePacket(PacketId);
698 assert(Packet && "Invalid packet");
700 Packet->reserved0 = 0;
701 Packet->reserved1 = 0;
702 Packet->dep_signal[0] = {0};
703 Packet->dep_signal[1] = {0};
704 Packet->dep_signal[2] = {0};
705 Packet->dep_signal[3] = {0};
706 Packet->dep_signal[4] = {0};
707 Packet->reserved2 = 0;
708 Packet->completion_signal = {0};
710 // Set input and output dependencies if needed.
711 if (OutputSignal)
712 Packet->completion_signal = OutputSignal->get();
713 if (InputSignal1)
714 Packet->dep_signal[0] = InputSignal1->get();
715 if (InputSignal2)
716 Packet->dep_signal[1] = InputSignal2->get();
718 // Publish the packet. Do not modify the packet after this point.
719 publishBarrierPacket(PacketId, Packet);
721 return Plugin::success();
724 /// Acquire a packet from the queue. This call may block the thread if there
725 /// is no space in the underlying HSA queue. It may need to wait until the HSA
726 /// runtime processes some packets. Assumes the queue lock is acquired.
727 hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
728 // Increase the queue index with relaxed memory order. Notice this will need
729 // another subsequent atomic operation with acquire order.
730 PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
732 // Wait for the packet slot to be available. Notice the atomic operation uses
733 // the acquire memory order.
734 while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
737 // Return the packet reference.
738 const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
739 return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
740 (PacketId & Mask);
743 /// Publish the kernel packet so that the HSA runtime can start processing
744 /// the kernel launch. Do not modify the packet once this function is called.
745 /// Assumes the queue lock is acquired.
746 void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
747 hsa_kernel_dispatch_packet_t *Packet) {
748 uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
750 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
751 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
752 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
754 // Publish the packet. Do not modify the packet after this point.
755 uint32_t HeaderWord = Header | (Setup << 16u);
756 __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
758 // Signal the doorbell about the published packet.
759 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
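// Note: the header (low 16 bits) and setup (high 16 bits) are combined into
// one 32-bit word and stored with a release atomic so the packet becomes
// valid for the HSA packet processor in a single step, after all other fields
// have been written. Writing the header last is what actually publishes the
// packet; the doorbell store merely notifies the packet processor.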
762 /// Publish the barrier packet so that the HSA runtime can start processing
763 /// the barrier. Next packets in the queue will not be processed until all
764 /// barrier dependencies (signals) are satisfied. Assumes the queue is locked.
765 void publishBarrierPacket(uint64_t PacketId,
766 hsa_barrier_and_packet_t *Packet) {
767 uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
768 uint16_t Setup = 0;
769 uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
770 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
771 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
773 // Publish the packet. Do not modify the packet after this point.
774 uint32_t HeaderWord = Header | (Setup << 16u);
775 __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
777 // Signal the doorbell about the published packet.
778 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
781 /// Callback that will be called when an error is detected on the HSA queue.
782 static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
783 auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
784 FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
787 /// The HSA queue.
788 hsa_queue_t *Queue;
790 /// Mutex to protect the acquiring and publishing of packets. For the moment,
791 /// we need this mutex to prevent publishing packets that are not ready to be
792 /// published in a multi-thread scenario. Without a queue lock, a thread T1
793 /// could acquire packet P while thread T2 acquires packet P+1. Thread T2 could
794 /// publish its packet P+1 (signaling the queue's doorbell) before packet P
795 /// from T1 is ready to be processed. That scenario should be invalid. Thus,
796 /// we use the following mutex to make packet acquiring and publishing atomic.
797 /// TODO: There are other more advanced approaches to avoid this mutex using
798 /// atomic operations. We can further investigate it if this is a bottleneck.
799 std::mutex Mutex;
801 /// The number of streams this queue is currently assigned to. A queue is
802 /// considered idle when this is zero; otherwise, it is busy.
803 uint32_t NumUsers;
806 /// Struct that implements a stream of asynchronous operations for AMDGPU
807 /// devices. This class relies on signals to implement streams and define the
808 /// dependencies between asynchronous operations.
809 struct AMDGPUStreamTy {
810 private:
811 /// Utility struct holding arguments for async H2H memory copies.
812 struct MemcpyArgsTy {
813 void *Dst;
814 const void *Src;
815 size_t Size;
818 /// Utility struct holding arguments for freeing buffers to memory managers.
819 struct ReleaseBufferArgsTy {
820 void *Buffer;
821 AMDGPUMemoryManagerTy *MemoryManager;
824 /// Utility struct holding arguments for releasing signals to signal managers.
825 struct ReleaseSignalArgsTy {
826 AMDGPUSignalTy *Signal;
827 AMDGPUSignalManagerTy *SignalManager;
830 /// The stream is composed of N stream slots. The struct below represents
831 /// the fields of each slot. Each slot has a signal and an optional action
832 /// function. When appending an HSA asynchronous operation to the stream, one
833 /// slot is consumed and used to store the operation's information. The
834 /// operation's output signal is set to the consumed slot's signal. If there
835 /// is a previous asynchronous operation on the previous slot, the HSA async
836 /// operation's input signal is set to the signal of the previous slot. This
837 /// way, we obtain a chain of dependent async operations. The action is a
838 /// function that will be executed eventually after the operation is
839 /// completed, e.g., for releasing a buffer.
840 struct StreamSlotTy {
841 /// The output signal of the stream operation. May be used by the subsequent
842 /// operation as input signal.
843 AMDGPUSignalTy *Signal;
845 /// The action that must be performed after the operation's completion. Set
846 /// to nullptr when there is no action to perform.
847 Error (*ActionFunction)(void *);
849 /// Space for the action's arguments. A pointer to these arguments is passed
850 /// to the action function. Notice the space of arguments is limited.
851 union {
852 MemcpyArgsTy MemcpyArgs;
853 ReleaseBufferArgsTy ReleaseBufferArgs;
854 ReleaseSignalArgsTy ReleaseSignalArgs;
855 } ActionArgs;
857 /// Create an empty slot.
858 StreamSlotTy() : Signal(nullptr), ActionFunction(nullptr) {}
860 /// Schedule a host memory copy action on the slot.
861 Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
862 ActionFunction = memcpyAction;
863 ActionArgs.MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
864 return Plugin::success();
867 /// Schedule a release buffer action on the slot.
868 Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
869 ActionFunction = releaseBufferAction;
870 ActionArgs.ReleaseBufferArgs = ReleaseBufferArgsTy{Buffer, &Manager};
871 return Plugin::success();
874 /// Schedule a signal release action on the slot.
875 Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
876 AMDGPUSignalManagerTy *SignalManager) {
877 ActionFunction = releaseSignalAction;
878 ActionArgs.ReleaseSignalArgs =
879 ReleaseSignalArgsTy{SignalToRelease, SignalManager};
880 return Plugin::success();
883 // Perform the action if needed.
884 Error performAction() {
885 if (!ActionFunction)
886 return Plugin::success();
888 // Perform the action.
889 if (ActionFunction == memcpyAction) {
890 if (auto Err = memcpyAction(&ActionArgs))
891 return Err;
892 } else if (ActionFunction == releaseBufferAction) {
893 if (auto Err = releaseBufferAction(&ActionArgs))
894 return Err;
895 } else if (ActionFunction == releaseSignalAction) {
896 if (auto Err = releaseSignalAction(&ActionArgs))
897 return Err;
898 } else {
899 return Plugin::error("Unknown action function!");
902 // Invalidate the action.
903 ActionFunction = nullptr;
905 return Plugin::success();
909 /// The device agent where the stream was created.
910 hsa_agent_t Agent;
912 /// The queue that the stream uses to launch kernels.
913 AMDGPUQueueTy *Queue;
915 /// The manager of signals to reuse signals.
916 AMDGPUSignalManagerTy &SignalManager;
918 /// A reference to the associated device.
919 GenericDeviceTy &Device;
921 /// Array of stream slots. Use std::deque because it can dynamically grow
922 /// without invalidating the already inserted elements. For instance, the
923 /// std::vector may invalidate the elements by reallocating the internal
924 /// array if there is not enough space on new insertions.
925 std::deque<StreamSlotTy> Slots;
927 /// The next available slot on the queue. This is reset to zero each time the
928 /// stream is synchronized. It also indicates the current number of consumed
929 /// slots at a given time.
930 uint32_t NextSlot;
932 /// The synchronization id. This number is increased each time the stream is
933 /// synchronized. It is useful to detect if an AMDGPUEventTy points to an
934 /// operation that was already finalized in a previous stream synchronization.
935 uint32_t SyncCycle;
937 /// A pointer associated with an RPC server running on the given device. If
938 /// RPC is not being used this will be a null pointer. Otherwise, this
939 /// indicates that an RPC server is expected to be run on this stream.
940 RPCServerTy *RPCServer;
942 /// Mutex to protect stream's management.
943 mutable std::mutex Mutex;
945 /// Timeout hint for HSA actively waiting for signal value to change
946 const uint64_t StreamBusyWaitMicroseconds;
948 /// Return the current number of asynchronous operations on the stream.
949 uint32_t size() const { return NextSlot; }
951 /// Return the last valid slot on the stream.
952 uint32_t last() const { return size() - 1; }
954 /// Consume one slot from the stream. Since the stream uses signals on demand
955 /// and releases them once the slot is no longer used, the function requires
956 /// an idle signal for the new consumed slot.
957 std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
958 // Double the stream size if needed. Since we use std::deque, this operation
959 // does not invalidate the already added slots.
960 if (Slots.size() == NextSlot)
961 Slots.resize(Slots.size() * 2);
963 // Update the next available slot and the stream size.
964 uint32_t Curr = NextSlot++;
966 // Retrieve the input signal, if any, of the current operation.
967 AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;
969 // Set the output signal of the current slot.
970 Slots[Curr].Signal = OutputSignal;
972 return std::make_pair(Curr, InputSignal);
975 /// Complete all pending post actions and reset the stream after synchronizing
976 /// or positively querying the stream.
977 Error complete() {
978 for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) {
979 // Take the post action of the operation if any.
980 if (auto Err = Slots[Slot].performAction())
981 return Err;
983 // Release the slot's signal if possible. Otherwise, another user will.
984 if (Slots[Slot].Signal->decreaseUseCount())
985 if (auto Err = SignalManager.returnResource(Slots[Slot].Signal))
986 return Err;
988 Slots[Slot].Signal = nullptr;
991 // Reset the stream slots to zero.
992 NextSlot = 0;
994 // Increase the synchronization id since the stream completed a sync cycle.
995 SyncCycle += 1;
997 return Plugin::success();
1000 /// Make the current stream wait on a specific operation of another stream.
1001 /// The idea is to make the current stream wait on two signals: 1) the last
1002 /// signal of the current stream, and 2) the last signal of the other stream.
1003 /// Use a barrier packet with two input signals.
1004 Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
1005 if (Queue == nullptr)
1006 return Plugin::error("Target queue was nullptr");
1009 /// The signal from the other stream that we must wait on.
1009 AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal;
1011 // Prevent the release of the other stream's signal.
1012 OtherSignal->increaseUseCount();
1014 // Retrieve an available signal for the operation's output.
1015 AMDGPUSignalTy *OutputSignal = nullptr;
1016 if (auto Err = SignalManager.getResource(OutputSignal))
1017 return Err;
1018 OutputSignal->reset();
1019 OutputSignal->increaseUseCount();
1021 // Consume stream slot and compute dependencies.
1022 auto [Curr, InputSignal] = consume(OutputSignal);
1024 // Setup the post action to release the signal.
1025 if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager))
1026 return Err;
1028 // Push a barrier into the queue with both input signals.
1029 return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
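// Note: the cross-stream dependency above is encoded as a barrier packet that
// takes both this stream's last signal and the other stream's signal as
// inputs. The other signal's use count is increased beforehand so the other
// stream cannot return it to the signal manager while this stream still
// depends on it; the scheduled post action releases it once the barrier
// completes.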
1032 /// Callback for running a specific asynchronous operation. This callback is
1033 /// used for hsa_amd_signal_async_handler. The argument is the operation that
1034 /// should be executed. Notice we use the post action mechanism to codify the
1035 /// asynchronous operation.
1036 static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
1037 StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
1038 assert(Slot && "Invalid slot");
1039 assert(Slot->Signal && "Invalid signal");
1041 // This thread is outside the stream mutex. Make sure the thread sees the
1042 // changes on the slot.
1043 std::atomic_thread_fence(std::memory_order_acquire);
1045 // Perform the operation.
1046 if (auto Err = Slot->performAction())
1047 FATAL_MESSAGE(1, "Error performing post action: %s",
1048 toString(std::move(Err)).data());
1050 // Signal the output signal to notify that the asynchronous operation finished.
1051 Slot->Signal->signal();
1053 // Unregister callback.
1054 return false;
1057 // Callback for host-to-host memory copies. This is an asynchronous action.
1058 static Error memcpyAction(void *Data) {
1059 MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
1060 assert(Args && "Invalid arguments");
1061 assert(Args->Dst && "Invalid destination buffer");
1062 assert(Args->Src && "Invalid source buffer");
1064 std::memcpy(Args->Dst, Args->Src, Args->Size);
1066 return Plugin::success();
1069 /// Releasing a memory buffer to a memory manager. This is a post completion
1070 /// action. There are three kinds of memory buffers:
1071 /// 1. For kernel arguments. This buffer can be freed after receiving the
1072 /// kernel completion signal.
1073 /// 2. For H2D transfers that need pinned memory space for staging. This
1074 /// buffer can be freed after receiving the transfer completion signal.
1075 /// 3. For D2H transfers that need pinned memory space for staging. This
1076 /// buffer cannot be freed after receiving the transfer completion signal
1077 /// because of the following asynchronous H2H callback.
1078 /// For this reason, this action can only be taken at
1079 /// AMDGPUStreamTy::complete().
1080 /// Because of case 3, all releaseBufferActions are taken at
1081 /// AMDGPUStreamTy::complete() in the current implementation.
1082 static Error releaseBufferAction(void *Data) {
1083 ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
1084 assert(Args && "Invalid arguments");
1085 assert(Args->MemoryManager && "Invalid memory manager");
1086 assert(Args->Buffer && "Invalid buffer");
1088 // Release the allocation to the memory manager.
1089 return Args->MemoryManager->deallocate(Args->Buffer);
1092 /// Releasing a signal object back to the SignalManager. This is a post completion
1093 /// action. This action can only be taken at AMDGPUStreamTy::complete().
1094 static Error releaseSignalAction(void *Data) {
1095 ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
1096 assert(Args && "Invalid arguments");
1097 assert(Args->Signal && "Invalid signal");
1098 assert(Args->SignalManager && "Invalid signal manager");
1100 // Release the signal if needed.
1101 if (Args->Signal->decreaseUseCount())
1102 if (auto Err = Args->SignalManager->returnResource(Args->Signal))
1103 return Err;
1105 return Plugin::success();
1108 public:
1109 /// Create an empty stream associated with a specific device.
1110 AMDGPUStreamTy(AMDGPUDeviceTy &Device);
1112 /// Initialize the stream's signals.
1113 Error init() { return Plugin::success(); }
1115 /// Deinitialize the stream's signals.
1116 Error deinit() { return Plugin::success(); }
1118 /// Attach an RPC server to this stream.
1119 void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
1121 /// Push an asynchronous kernel to the stream. The kernel arguments must be
1122 /// placed in a special allocation for kernel args and must remain alive until
1123 /// the kernel finishes. Once the kernel is finished, the stream will release
1124 /// the kernel args buffer to the specified memory manager.
1125 Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
1126 uint32_t NumThreads, uint64_t NumBlocks,
1127 uint32_t GroupSize,
1128 AMDGPUMemoryManagerTy &MemoryManager) {
1129 if (Queue == nullptr)
1130 return Plugin::error("Target queue was nullptr");
1132 // Retrieve an available signal for the operation's output.
1133 AMDGPUSignalTy *OutputSignal = nullptr;
1134 if (auto Err = SignalManager.getResource(OutputSignal))
1135 return Err;
1136 OutputSignal->reset();
1137 OutputSignal->increaseUseCount();
1139 std::lock_guard<std::mutex> StreamLock(Mutex);
1141 // Consume stream slot and compute dependencies.
1142 auto [Curr, InputSignal] = consume(OutputSignal);
1144 // Setup the post action to release the kernel args buffer.
1145 if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
1146 return Err;
1148 // Push the kernel with the output signal and an input signal (optional)
1149 return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
1150 GroupSize, OutputSignal, InputSignal);
1153 /// Push an asynchronous memory copy between pinned memory buffers.
1154 Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
1155 uint64_t CopySize) {
1156 // Retrieve an available signal for the operation's output.
1157 AMDGPUSignalTy *OutputSignal = nullptr;
1158 if (auto Err = SignalManager.getResource(OutputSignal))
1159 return Err;
1160 OutputSignal->reset();
1161 OutputSignal->increaseUseCount();
1163 std::lock_guard<std::mutex> Lock(Mutex);
1165 // Consume stream slot and compute dependencies.
1166 auto [Curr, InputSignal] = consume(OutputSignal);
1168 // Avoid defining the input dependency if already satisfied.
1169 if (InputSignal && !InputSignal->load())
1170 InputSignal = nullptr;
1172 // Issue the async memory copy.
1173 hsa_status_t Status;
1174 if (InputSignal) {
1175 hsa_signal_t InputSignalRaw = InputSignal->get();
1176 Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
1177 &InputSignalRaw, OutputSignal->get());
1178 } else
1179 Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
1180 nullptr, OutputSignal->get());
1181 return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
1184 /// Push an asynchronous memory copy device-to-host involving an unpinned
1185 /// memory buffer. The operation consists of a two-step copy from the
1186 /// device buffer to an intermediate pinned host buffer, and then, to an
1187 /// unpinned host buffer. Both operations are asynchronous and dependent.
1188 /// The intermediate pinned buffer will be released to the specified memory
1189 /// manager once the operation completes.
1190 Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
1191 uint64_t CopySize,
1192 AMDGPUMemoryManagerTy &MemoryManager) {
1193 // Retrieve available signals for the operation's outputs.
1194 AMDGPUSignalTy *OutputSignals[2] = {};
1195 if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1196 return Err;
1197 for (auto Signal : OutputSignals) {
1198 Signal->reset();
1199 Signal->increaseUseCount();
1202 std::lock_guard<std::mutex> Lock(Mutex);
1204 // Consume stream slot and compute dependencies.
1205 auto [Curr, InputSignal] = consume(OutputSignals[0]);
1207 // Avoid defining the input dependency if already satisfied.
1208 if (InputSignal && !InputSignal->load())
1209 InputSignal = nullptr;
1211 // Setup the post action for releasing the intermediate buffer.
1212 if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1213 return Err;
1215 // Issue the first step: device to host transfer. Avoid defining the input
1216 // dependency if already satisfied.
1217 hsa_status_t Status;
1218 if (InputSignal) {
1219 hsa_signal_t InputSignalRaw = InputSignal->get();
1220 Status =
1221 hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
1222 &InputSignalRaw, OutputSignals[0]->get());
1223 } else {
1224 Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
1225 nullptr, OutputSignals[0]->get());
1228 if (auto Err =
1229 Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
1230 return Err;
1232 // Consume another stream slot and compute dependencies.
1233 std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
1234 assert(InputSignal && "Invalid input signal");
1236 // The std::memcpy is done asynchronously using an async handler. We store
1237 // the function's information in the action, but it is not actually a post action.
1238 if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
1239 return Err;
1241 // Make changes on this slot visible to the async handler's thread.
1242 std::atomic_thread_fence(std::memory_order_release);
1244 // Issue the second step: host to host transfer.
1245 Status = hsa_amd_signal_async_handler(
1246 InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1247 (void *)&Slots[Curr]);
1249 return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s");
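// Summary of the D2H pipeline above: the first slot holds the device-to-
// pinned-host copy (hsa_amd_memory_async_copy) plus the post action that
// releases the pinned staging buffer back to the memory manager, and the
// second slot holds the pinned-to-unpinned std::memcpy, which is triggered by
// hsa_amd_signal_async_handler once the first slot's signal reaches zero and
// then signals the second slot's completion signal from the handler thread.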
1252 /// Push an asynchronous memory copy host-to-device involving an unpinned
1253 /// memory buffer. The operation consists of a two-step copy from the
1254 /// unpinned host buffer to an intermediate pinned host buffer, and then, to
1255 /// the device buffer. Both operations are asynchronous and dependent.
1256 /// The intermediate pinned buffer will be released to the specified memory
1257 /// manager once the operation completes.
1258 Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
1259 uint64_t CopySize,
1260 AMDGPUMemoryManagerTy &MemoryManager) {
1261 // Retrieve available signals for the operation's outputs.
1262 AMDGPUSignalTy *OutputSignals[2] = {};
1263 if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1264 return Err;
1265 for (auto Signal : OutputSignals) {
1266 Signal->reset();
1267 Signal->increaseUseCount();
1270 AMDGPUSignalTy *OutputSignal = OutputSignals[0];
1272 std::lock_guard<std::mutex> Lock(Mutex);
1274 // Consume stream slot and compute dependencies.
1275 auto [Curr, InputSignal] = consume(OutputSignal);
1277 // Avoid defining the input dependency if already satisfied.
1278 if (InputSignal && !InputSignal->load())
1279 InputSignal = nullptr;
1281 // Issue the first step: host to host transfer.
1282 if (InputSignal) {
1283 // The std::memcpy is done asynchronously using an async handler. We store
1284 // the function's information in the action but it is not actually a
1285 // post action.
1286 if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
1287 return Err;
1289 // Make changes on this slot visible to the async handler's thread.
1290 std::atomic_thread_fence(std::memory_order_release);
1292 hsa_status_t Status = hsa_amd_signal_async_handler(
1293 InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1294 (void *)&Slots[Curr]);
1296 if (auto Err = Plugin::check(Status,
1297 "Error in hsa_amd_signal_async_handler: %s"))
1298 return Err;
1300 // Now use the second output signal.
1301 OutputSignal = OutputSignals[1];
1303 // Consume another stream slot and compute dependencies.
1304 std::tie(Curr, InputSignal) = consume(OutputSignal);
1305 } else {
1306 // All preceding operations completed, copy the memory synchronously.
1307 std::memcpy(Inter, Src, CopySize);
1309 // Return the second signal because it will not be used.
1310 OutputSignals[1]->decreaseUseCount();
1311 if (auto Err = SignalManager.returnResource(OutputSignals[1]))
1312 return Err;
1315 // Setup the post action to release the intermediate pinned buffer.
1316 if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1317 return Err;
1319 // Issue the second step: host to device transfer. Avoid defining the input
1320 // dependency if already satisfied.
1321 hsa_status_t Status;
1322 if (InputSignal && InputSignal->load()) {
1323 hsa_signal_t InputSignalRaw = InputSignal->get();
1324 Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
1325 &InputSignalRaw, OutputSignal->get());
1326 } else
1327 Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
1328 nullptr, OutputSignal->get());
1330 return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
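// Note: when there is no pending predecessor on the stream, the staging
// std::memcpy above is performed synchronously and the second output signal
// is returned to the manager, so the whole transfer consumes only one slot
// and one HSA copy.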
1333 // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
1334 Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
1335 hsa_agent_t SrcAgent, uint64_t CopySize) {
1336 AMDGPUSignalTy *OutputSignal;
1337 if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
1338 return Err;
1339 OutputSignal->reset();
1340 OutputSignal->increaseUseCount();
1342 std::lock_guard<std::mutex> Lock(Mutex);
1344 // Consume stream slot and compute dependencies.
1345 auto [Curr, InputSignal] = consume(OutputSignal);
1347 // Avoid defining the input dependency if already satisfied.
1348 if (InputSignal && !InputSignal->load())
1349 InputSignal = nullptr;
1351 // The agents need to have access to the corresponding memory
1352 // This is presently only true if the pointers were originally
1353 // allocated by this runtime or the caller made the appropriate
1354 // access calls.
1356 hsa_status_t Status;
1357 if (InputSignal && InputSignal->load()) {
1358 hsa_signal_t InputSignalRaw = InputSignal->get();
1359 Status =
1360 hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
1361 &InputSignalRaw, OutputSignal->get());
1362 } else
1363 Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
1364 0, nullptr, OutputSignal->get());
1366 return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
1369 /// Synchronize with the stream. The current thread waits until all operations
1370 /// are finalized and it performs the pending post actions (i.e., releasing
1371 /// intermediate buffers).
1372 Error synchronize() {
1373 std::lock_guard<std::mutex> Lock(Mutex);
1375 // No need to synchronize anything.
1376 if (size() == 0)
1377 return Plugin::success();
1379 // Wait until all previous operations on the stream have completed.
1380 if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds,
1381 RPCServer, &Device))
1382 return Err;
1384 // Reset the stream and perform all pending post actions.
1385 return complete();
1388 /// Query the stream and complete pending post actions if operations finished.
1389 /// Return whether all the operations completed. This operation does not block
1390 /// the calling thread.
1391 Expected<bool> query() {
1392 std::lock_guard<std::mutex> Lock(Mutex);
1394 // No need to query anything.
1395 if (size() == 0)
1396 return true;
1398 // The last operation did not complete yet. Return directly.
1399 if (Slots[last()].Signal->load())
1400 return false;
1402 // Reset the stream and perform all pending post actions.
1403 if (auto Err = complete())
1404 return std::move(Err);
1406 return true;
1409 /// Record the state of the stream on an event.
1410 Error recordEvent(AMDGPUEventTy &Event) const;
1412 /// Make the stream wait on an event.
1413 Error waitEvent(const AMDGPUEventTy &Event);
1415 friend struct AMDGPUStreamManagerTy;
1418 /// Class representing an event on AMDGPU. The event basically stores some
1419 /// information regarding the state of the recorded stream.
1420 struct AMDGPUEventTy {
1421 /// Create an empty event.
1422 AMDGPUEventTy(AMDGPUDeviceTy &Device)
1423 : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
1425 /// Initialize and deinitialize.
1426 Error init() { return Plugin::success(); }
1427 Error deinit() { return Plugin::success(); }
1429 /// Record the state of a stream on the event.
1430 Error record(AMDGPUStreamTy &Stream) {
1431 std::lock_guard<std::mutex> Lock(Mutex);
1433 // Overwrite any previously recorded stream.
1434 RecordedStream = &Stream;
1436 return Stream.recordEvent(*this);
1439 /// Make a stream wait on the current event.
1440 Error wait(AMDGPUStreamTy &Stream) {
1441 std::lock_guard<std::mutex> Lock(Mutex);
1443 if (!RecordedStream)
1444 return Plugin::error("Event does not have any recorded stream");
1446 // Synchronizing the same stream. Do nothing.
1447 if (RecordedStream == &Stream)
1448 return Plugin::success();
1450 // No need to wait on anything; the recorded stream already finished the
1451 // corresponding operation.
1452 if (RecordedSlot < 0)
1453 return Plugin::success();
1455 return Stream.waitEvent(*this);
1458 protected:
1459 /// The stream registered in this event.
1460 AMDGPUStreamTy *RecordedStream;
1462 /// The recorded operation on the recorded stream.
1463 int64_t RecordedSlot;
1465 /// The sync cycle when the stream was recorded. Used to detect stale events.
1466 int64_t RecordedSyncCycle;
1468 /// Mutex to safely access event fields.
1469 mutable std::mutex Mutex;
1471 friend struct AMDGPUStreamTy;
1474 Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
1475 std::lock_guard<std::mutex> Lock(Mutex);
1477 if (size() > 0) {
1478 // Record the synchronize identifier (to detect stale recordings) and
1479 // the last valid stream's operation.
1480 Event.RecordedSyncCycle = SyncCycle;
1481 Event.RecordedSlot = last();
1483 assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
1484 assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
1485 } else {
1486 // The stream is empty, everything already completed, record nothing.
1487 Event.RecordedSyncCycle = -1;
1488 Event.RecordedSlot = -1;
1490 return Plugin::success();
1493 Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
1494 // Retrieve the recorded stream on the event.
1495 AMDGPUStreamTy &RecordedStream = *Event.RecordedStream;
1497 std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex);
1499 // The recorded stream already completed the operation because the synchronization
1500 // identifier is already outdated.
1501 if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle)
1502 return Plugin::success();
1504 // Again, the recorded stream already completed the operation: the last
1505 // operation's output signal is already satisfied.
1506 if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load())
1507 return Plugin::success();
1509 // Otherwise, make the current stream wait on the other stream's operation.
1510 return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
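/// Resource manager for AMDGPU streams. It also owns the HSA queues that back
/// the streams and assigns queues to streams based on their utilization.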
1513 struct AMDGPUStreamManagerTy final
1514 : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
1515 using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
1516 using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
1518 AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
1519 : GenericDeviceResourceManagerTy(Device),
1520 OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
1521 NextQueue(0), Agent(HSAAgent) {}
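/// Initialize the stream pool. Only the first HSA queue is created eagerly;
/// the remaining queues are initialized lazily when assigned to a stream.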
1523 Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
1524 Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
1525 QueueSize = HSAQueueSize;
1526 MaxNumQueues = NumHSAQueues;
1527 // Initialize one queue eagerly
1528 if (auto Err = Queues.front().init(Agent, QueueSize))
1529 return Err;
1531 return GenericDeviceResourceManagerTy::init(InitialSize);
1534 /// Deinitialize the resource pool and delete all resources. This function
1535 /// must be called before the destructor.
1536 Error deinit() override {
1537 // De-init all queues
1538 for (AMDGPUQueueTy &Queue : Queues) {
1539 if (auto Err = Queue.deinit())
1540 return Err;
1543 return GenericDeviceResourceManagerTy::deinit();
1546 /// Get a single stream from the pool or create new resources.
1547 virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override {
1548 return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) {
1549 return assignNextQueue(Handle);
1553 /// Return stream to the pool.
1554 virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override {
1555 return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) {
1556 Handle->Queue->removeUser();
1557 return Plugin::success();
1561 private:
1562 /// Search for and assign a preferably idle queue to the given stream. If
1563 /// there is no queue without current users, choose the queue with the lowest
1564 /// user count. If utilization tracking is disabled, use round-robin selection.
1565 inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
1566 // Start from zero when tracking utilization; otherwise, use the round-robin policy.
1567 uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
1569 if (OMPX_QueueTracking) {
1570 // Find the least used queue.
1571 for (uint32_t I = 0; I < MaxNumQueues; ++I) {
1572 // Early exit when an initialized queue is idle.
1573 if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
1574 Index = I;
1575 break;
1578 // Update the least used queue.
1579 if (Queues[Index].getUserCount() > Queues[I].getUserCount())
1580 Index = I;
1584 // Make sure the queue is initialized, then add user & assign.
1585 if (auto Err = Queues[Index].init(Agent, QueueSize))
1586 return Err;
1587 Queues[Index].addUser();
1588 Stream->Queue = &Queues[Index];
1590 return Plugin::success();
1593 /// Envar for controlling the tracking of busy HSA queues.
1594 BoolEnvar OMPX_QueueTracking;
1596 /// The next queue index to use for round robin selection.
1597 uint32_t NextQueue;
1599 /// The queues which are assigned to requested streams.
1600 std::vector<AMDGPUQueueTy> Queues;
1602 /// The corresponding device as HSA agent.
1603 hsa_agent_t Agent;
1605 /// The maximum number of queues.
1606 int MaxNumQueues;
1608 /// The size of created queues.
1609 int QueueSize;
1612 /// Abstract class that holds the common members of the actual kernel devices
1613 /// and the host device. Both types should inherit from this class.
1614 struct AMDGenericDeviceTy {
1615 AMDGenericDeviceTy() {}
1617 virtual ~AMDGenericDeviceTy() {}
1619 /// Create all memory pools which the device has access to and classify them.
1620 Error initMemoryPools() {
1621 // Retrieve all memory pools from the device agent(s).
1622 Error Err = retrieveAllMemoryPools();
1623 if (Err)
1624 return Err;
1626 for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
1627 // Initialize the memory pool and retrieve some basic info.
1628 Error Err = MemoryPool->init();
1629 if (Err)
1630 return Err;
1632 if (!MemoryPool->isGlobal())
1633 continue;
1635 // Classify the memory pools depending on their properties.
1636 if (MemoryPool->isFineGrained()) {
1637 FineGrainedMemoryPools.push_back(MemoryPool);
1638 if (MemoryPool->supportsKernelArgs())
1639 ArgsMemoryPools.push_back(MemoryPool);
1640 } else if (MemoryPool->isCoarseGrained()) {
1641 CoarseGrainedMemoryPools.push_back(MemoryPool);
1644 return Plugin::success();
1647 /// Destroy all memory pools.
1648 Error deinitMemoryPools() {
1649 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools)
1650 delete Pool;
1652 AllMemoryPools.clear();
1653 FineGrainedMemoryPools.clear();
1654 CoarseGrainedMemoryPools.clear();
1655 ArgsMemoryPools.clear();
1657 return Plugin::success();
1660 /// Retrieve and construct all memory pools from the device agent(s).
1661 virtual Error retrieveAllMemoryPools() = 0;
1663 /// Get the device agent.
1664 virtual hsa_agent_t getAgent() const = 0;
1666 protected:
1667 /// Array of all memory pools available to the host agents.
1668 llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools;
1670 /// Array of fine-grained memory pools available to the host agents.
1671 llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools;
1673 /// Array of coarse-grained memory pools available to the host agents.
1674 llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools;
1676 /// Array of kernel args memory pools available to the host agents.
1677 llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools;
1680 /// Class representing the host device. This host device may have more than one
1681 /// HSA host agent. We aggregate all its resources into the same instance.
1682 struct AMDHostDeviceTy : public AMDGenericDeviceTy {
1683 /// Create a host device from an array of host agents.
1684 AMDHostDeviceTy(const llvm::SmallVector<hsa_agent_t> &HostAgents)
1685 : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(),
1686 PinnedMemoryManager() {
1687 assert(HostAgents.size() && "No host agent found");
1690 /// Initialize the host device memory pools and the memory managers for
1691 /// kernel args and host pinned memory allocations.
1692 Error init() {
1693 if (auto Err = initMemoryPools())
1694 return Err;
1696 if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool()))
1697 return Err;
1699 if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool()))
1700 return Err;
1702 return Plugin::success();
1705 /// Deinitialize memory pools and managers.
1706 Error deinit() {
1707 if (auto Err = deinitMemoryPools())
1708 return Err;
1710 if (auto Err = ArgsMemoryManager.deinit())
1711 return Err;
1713 if (auto Err = PinnedMemoryManager.deinit())
1714 return Err;
1716 return Plugin::success();
1719 /// Retrieve and construct all memory pools from the host agents.
1720 Error retrieveAllMemoryPools() override {
1721 // Iterate through the available pools across the host agents.
1722 for (hsa_agent_t Agent : Agents) {
1723 Error Err = utils::iterateAgentMemoryPools(
1724 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
1725 AMDGPUMemoryPoolTy *MemoryPool =
1726 new AMDGPUMemoryPoolTy(HSAMemoryPool);
1727 AllMemoryPools.push_back(MemoryPool);
1728 return HSA_STATUS_SUCCESS;
1730 if (Err)
1731 return Err;
1733 return Plugin::success();
1736 /// Get one of the host agents. Always return the first agent.
1737 hsa_agent_t getAgent() const override { return Agents[0]; }
1739 /// Get a memory pool for fine-grained allocations.
1740 AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
1741 assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
1742 // Retrieve any memory pool.
1743 return *FineGrainedMemoryPools[0];
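/// Get a memory pool for coarse-grained allocations.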
1746 AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
1747 assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
1748 // Retrieve any memory pool.
1749 return *CoarseGrainedMemoryPools[0];
1752 /// Get a memory pool for kernel args allocations.
1753 AMDGPUMemoryPoolTy &getArgsMemoryPool() {
1754 assert(!ArgsMemoryPools.empty() && "No kernelargs mempool");
1755 // Retrieve any memory pool.
1756 return *ArgsMemoryPools[0];
1759 /// Getters for kernel args and host pinned memory managers.
1760 AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; }
1761 AMDGPUMemoryManagerTy &getPinnedMemoryManager() {
1762 return PinnedMemoryManager;
1765 private:
1766 /// Array of agents on the host side.
1767 const llvm::SmallVector<hsa_agent_t> Agents;
1769 // Memory manager for kernel arguments.
1770 AMDGPUMemoryManagerTy ArgsMemoryManager;
1772 // Memory manager for pinned memory.
1773 AMDGPUMemoryManagerTy PinnedMemoryManager;
1776 /// Class implementing the AMDGPU device functionalities which derives from the
1777 /// generic device class.
1778 struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1779 // Create an AMDGPU device with a device id and default AMDGPU grid values.
1780 AMDGPUDeviceTy(int32_t DeviceId, int32_t NumDevices,
1781 AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
1782 : GenericDeviceTy(DeviceId, NumDevices, {0}), AMDGenericDeviceTy(),
1783 OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
1784 OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
1785 OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
1786 OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
1787 1 * 1024 * 1024), // 1MB
1788 OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
1789 64),
1790 OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
1791 AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
1792 AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
1794 ~AMDGPUDeviceTy() {}
1796 /// Initialize the device, its resources and get its properties.
1797 Error initImpl(GenericPluginTy &Plugin) override {
1798 // First setup all the memory pools.
1799 if (auto Err = initMemoryPools())
1800 return Err;
1802 char GPUName[64];
1803 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
1804 return Err;
1805 ComputeUnitKind = GPUName;
1807 // Get the wavefront size.
1808 uint32_t WavefrontSize = 0;
1809 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize))
1810 return Err;
1811 GridValues.GV_Warp_Size = WavefrontSize;
1813 // Get the frequency of the steady clock.
1814 if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
1815 ClockFrequency))
1816 return Err;
1818 // Load the grid values depending on the wavefront size.
1819 if (WavefrontSize == 32)
1820 GridValues = getAMDGPUGridValues<32>();
1821 else if (WavefrontSize == 64)
1822 GridValues = getAMDGPUGridValues<64>();
1823 else
1824 return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
1826 // Get maximum number of workitems per workgroup.
1827 uint16_t WorkgroupMaxDim[3];
1828 if (auto Err =
1829 getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim))
1830 return Err;
1831 GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0];
1833 // Get maximum number of workgroups.
1834 hsa_dim3_t GridMaxDim;
1835 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
1836 return Err;
1838 GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
1839 if (GridValues.GV_Max_Teams == 0)
1840 return Plugin::error("Maximum number of teams cannot be zero");
1842 // Compute the default number of teams.
1843 uint32_t ComputeUnits = 0;
1844 if (auto Err =
1845 getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
1846 return Err;
1847 GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
1849 uint32_t WavesPerCU = 0;
1850 if (auto Err =
1851 getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
1852 return Err;
1853 HardwareParallelism = ComputeUnits * WavesPerCU;
1855 // Get maximum size of any device queues and maximum number of queues.
1856 uint32_t MaxQueueSize;
1857 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
1858 return Err;
1860 uint32_t MaxQueues;
1861 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues))
1862 return Err;
1864 // Compute the number of queues and their size.
1865 OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
1866 OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
1868 // Initialize stream pool.
1869 if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams,
1870 OMPX_NumQueues, OMPX_QueueSize))
1871 return Err;
1873 // Initialize event pool.
1874 if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents))
1875 return Err;
1877 // Initialize signal pool.
1878 if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
1879 return Err;
1881 return Plugin::success();
1884 /// Deinitialize the device and release its resources.
1885 Error deinitImpl() override {
1886 // Deinitialize the stream and event pools.
1887 if (auto Err = AMDGPUStreamManager.deinit())
1888 return Err;
1890 if (auto Err = AMDGPUEventManager.deinit())
1891 return Err;
1893 if (auto Err = AMDGPUSignalManager.deinit())
1894 return Err;
1896 // Close modules if necessary.
1897 if (!LoadedImages.empty()) {
1898 // Each image has its own module.
1899 for (DeviceImageTy *Image : LoadedImages) {
1900 AMDGPUDeviceImageTy &AMDImage =
1901 static_cast<AMDGPUDeviceImageTy &>(*Image);
1903 // Unload the executable of the image.
1904 if (auto Err = AMDImage.unloadExecutable())
1905 return Err;
1909 // Invalidate agent reference.
1910 Agent = {0};
1912 return Plugin::success();
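/// Return the time, in microseconds, that a stream busy-waits before
/// switching to a blocking HSA wait state.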
1915 uint64_t getStreamBusyWaitMicroseconds() const {
1916 return OMPX_StreamBusyWait;
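/// Post-process a JIT-compiled image: materialize the buffer into a temporary
/// file, link it with lld for the device's architecture, and return the
/// resulting image buffer.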
1919 Expected<std::unique_ptr<MemoryBuffer>>
1920 doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
1922 // TODO: We should try to avoid materialization but there seems to be no
1923 // good linker interface w/o file i/o.
1924 SmallString<128> LinkerOutputFilePath;
1925 std::error_code EC = sys::fs::createTemporaryFile(
1926 "amdgpu-pre-link-jit", ".out", LinkerOutputFilePath);
1927 if (EC)
1928 return createStringError(EC,
1929 "Failed to create temporary file for linker");
1931 SmallString<128> LinkerInputFilePath = LinkerOutputFilePath;
1932 LinkerInputFilePath.pop_back_n(2);
1934 auto FD = raw_fd_ostream(LinkerInputFilePath.data(), EC);
1935 if (EC)
1936 return createStringError(EC, "Failed to open temporary file for linker");
1937 FD.write(MB->getBufferStart(), MB->getBufferSize());
1938 FD.close();
1940 const auto &ErrorOrPath = sys::findProgramByName("lld");
1941 if (!ErrorOrPath)
1942 return createStringError(inconvertibleErrorCode(),
1943 "Failed to find `lld` on the PATH.");
1945 std::string LLDPath = ErrorOrPath.get();
1946 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
1947 "Using `%s` to link JITed amdgcn ouput.", LLDPath.c_str());
1949 std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
1951 StringRef Args[] = {LLDPath,
1952 "-flavor",
1953 "gnu",
1954 "--no-undefined",
1955 "-shared",
1956 MCPU,
1957 "-o",
1958 LinkerOutputFilePath.data(),
1959 LinkerInputFilePath.data()};
1961 std::string Error;
1962 int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error);
1963 if (RC)
1964 return createStringError(inconvertibleErrorCode(),
1965 "Linking optimized bitcode failed: %s",
1966 Error.c_str());
1968 return std::move(
1969 MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath.data()).get());
1972 /// See GenericDeviceTy::getComputeUnitKind().
1973 std::string getComputeUnitKind() const override { return ComputeUnitKind; }
1975 /// Returns the clock frequency for the given AMDGPU device.
1976 uint64_t getClockFrequency() const override { return ClockFrequency; }
1978 /// Allocate and construct an AMDGPU kernel.
1979 Expected<GenericKernelTy &>
1980 constructKernel(const __tgt_offload_entry &KernelEntry) override {
1981 // Allocate and construct the AMDGPU kernel.
1982 AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
1983 if (!AMDGPUKernel)
1984 return Plugin::error("Failed to allocate memory for AMDGPU kernel");
1986 new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name);
1988 return *AMDGPUKernel;
1991 /// Set the current context to this device's context. Do nothing since the
1992 /// AMDGPU devices do not have the concept of contexts.
1993 Error setContext() override { return Plugin::success(); }
1995 /// AMDGPU returns the product of the number of compute units and the waves
1996 /// per compute unit.
1997 uint64_t getHardwareParallelism() const override {
1998 return HardwareParallelism;
2001 /// We want to set up the RPC server for host services to the GPU if it is
2002 /// available.
2003 bool shouldSetupRPCServer() const override {
2004 return libomptargetSupportsRPC();
2008 /// The RPC interface should have enough space for all available parallelism.
2008 uint64_t requestedRPCPortCount() const override {
2009 return getHardwareParallelism();
2012 /// Get the stream of the asynchronous info structure or get a new one.
2013 Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
2014 AMDGPUStreamTy *&Stream) {
2015 // Get the stream (if any) from the async info.
2016 Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
2017 if (!Stream) {
2018 // There was no stream; get an idle one.
2019 if (auto Err = AMDGPUStreamManager.getResource(Stream))
2020 return Err;
2022 // Modify the async info's stream.
2023 AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
2025 return Plugin::success();
2028 /// Load the binary image into the device and allocate an image object.
2029 Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
2030 int32_t ImageId) override {
2031 // Allocate and initialize the image object.
2032 AMDGPUDeviceImageTy *AMDImage =
2033 Plugin::get().allocate<AMDGPUDeviceImageTy>();
2034 new (AMDImage) AMDGPUDeviceImageTy(ImageId, TgtImage);
2036 // Load the HSA executable.
2037 if (Error Err = AMDImage->loadExecutable(*this))
2038 return std::move(Err);
2040 return AMDImage;
2043 /// Allocate memory on the device or related to the device.
2044 void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
2046 /// Deallocate memory on the device or related to the device.
2047 int free(void *TgtPtr, TargetAllocTy Kind) override {
2048 if (TgtPtr == nullptr)
2049 return OFFLOAD_SUCCESS;
2051 AMDGPUMemoryPoolTy *MemoryPool = nullptr;
2052 switch (Kind) {
2053 case TARGET_ALLOC_DEFAULT:
2054 case TARGET_ALLOC_DEVICE:
2055 MemoryPool = CoarseGrainedMemoryPools[0];
2056 break;
2057 case TARGET_ALLOC_HOST:
2058 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2059 break;
2060 case TARGET_ALLOC_SHARED:
2061 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2062 break;
2065 if (!MemoryPool) {
2066 REPORT("No memory pool for the specified allocation kind\n");
2067 return OFFLOAD_FAIL;
2070 if (Error Err = MemoryPool->deallocate(TgtPtr)) {
2071 REPORT("%s\n", toString(std::move(Err)).data());
2072 return OFFLOAD_FAIL;
2075 return OFFLOAD_SUCCESS;
2078 /// Synchronize current thread with the pending operations on the async info.
2079 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
2080 AMDGPUStreamTy *Stream =
2081 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2082 assert(Stream && "Invalid stream");
2084 if (auto Err = Stream->synchronize())
2085 return Err;
2087 // Once the stream is synchronized, return it to the stream pool and reset
2088 // AsyncInfo. This is to make sure the synchronization only works for its
2089 // own tasks.
2090 AsyncInfo.Queue = nullptr;
2091 return AMDGPUStreamManager.returnResource(Stream);
2094 /// Query for the completion of the pending operations on the async info.
2095 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
2096 AMDGPUStreamTy *Stream =
2097 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2098 assert(Stream && "Invalid stream");
2100 auto CompletedOrErr = Stream->query();
2101 if (!CompletedOrErr)
2102 return CompletedOrErr.takeError();
2104 // Return if the stream did not complete yet.
2105 if (!(*CompletedOrErr))
2106 return Plugin::success();
2108 // Once the stream has completed, return it to the stream pool and reset
2109 // AsyncInfo. This is to make sure the synchronization only works for its
2110 // own tasks.
2111 AsyncInfo.Queue = nullptr;
2112 return AMDGPUStreamManager.returnResource(Stream);
2115 /// Pin the host buffer and return the device pointer that should be used for
2116 /// device transfers.
2117 Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
2118 void *PinnedPtr = nullptr;
2120 hsa_status_t Status =
2121 hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
2122 if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2123 return std::move(Err);
2125 return PinnedPtr;
2128 /// Unpin the host buffer.
2129 Error dataUnlockImpl(void *HstPtr) override {
2130 hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
2131 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2134 /// Check through the HSA runtime whether the \p HstPtr buffer is pinned.
2135 Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
2136 void *&BaseDevAccessiblePtr,
2137 size_t &BaseSize) const override {
2138 hsa_amd_pointer_info_t Info;
2139 Info.size = sizeof(hsa_amd_pointer_info_t);
2141 hsa_status_t Status =
2142 hsa_amd_pointer_info(HstPtr, &Info, /* Allocator */ nullptr,
2143 /* Number of accessible agents (out) */ nullptr,
2144 /* Accessible agents */ nullptr);
2145 if (auto Err = Plugin::check(Status, "Error in hsa_amd_pointer_info: %s"))
2146 return std::move(Err);
2148 // The buffer may be locked or allocated through HSA allocators. Assume that
2149 // the buffer is host pinned if the runtime reports an HSA type.
2150 if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED &&
2151 Info.type != HSA_EXT_POINTER_TYPE_HSA)
2152 return false;
2154 assert(Info.hostBaseAddress && "Invalid host pinned address");
2155 assert(Info.agentBaseAddress && "Invalid agent pinned address");
2156 assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size");
2158 // Save the allocation info in the output parameters.
2159 BaseHstPtr = Info.hostBaseAddress;
2160 BaseDevAccessiblePtr = Info.agentBaseAddress;
2161 BaseSize = Info.sizeInBytes;
2163 return true;
2166 /// Submit data to the device (host to device transfer).
2167 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
2168 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2169 AMDGPUStreamTy *Stream = nullptr;
2170 void *PinnedPtr = nullptr;
2172 // Use one-step asynchronous operation when host memory is already pinned.
2173 if (void *PinnedPtr =
2174 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2175 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2176 return Err;
2177 return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
2180 // For large transfers use synchronous behavior.
2181 if (Size >= OMPX_MaxAsyncCopyBytes) {
2182 if (AsyncInfoWrapper.hasQueue())
2183 if (auto Err = synchronize(AsyncInfoWrapper))
2184 return Err;
2186 hsa_status_t Status;
2187 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2188 &PinnedPtr);
2189 if (auto Err =
2190 Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2191 return Err;
2193 AMDGPUSignalTy Signal;
2194 if (auto Err = Signal.init())
2195 return Err;
2197 Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
2198 0, nullptr, Signal.get());
2199 if (auto Err =
2200 Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
2201 return Err;
2203 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2204 return Err;
2206 if (auto Err = Signal.deinit())
2207 return Err;
2209 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2210 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2213 // Otherwise, use two-step copy with an intermediate pinned host buffer.
2214 AMDGPUMemoryManagerTy &PinnedMemoryManager =
2215 HostDevice.getPinnedMemoryManager();
2216 if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2217 return Err;
2219 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2220 return Err;
2222 return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
2223 PinnedMemoryManager);
2226 /// Retrieve data from the device (device to host transfer).
2227 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
2228 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2229 AMDGPUStreamTy *Stream = nullptr;
2230 void *PinnedPtr = nullptr;
2232 // Use one-step asynchronous operation when host memory is already pinned.
2233 if (void *PinnedPtr =
2234 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2235 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2236 return Err;
2238 return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
2241 // For large transfers use synchronous behavior.
2242 if (Size >= OMPX_MaxAsyncCopyBytes) {
2243 if (AsyncInfoWrapper.hasQueue())
2244 if (auto Err = synchronize(AsyncInfoWrapper))
2245 return Err;
2247 hsa_status_t Status;
2248 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2249 &PinnedPtr);
2250 if (auto Err =
2251 Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2252 return Err;
2254 AMDGPUSignalTy Signal;
2255 if (auto Err = Signal.init())
2256 return Err;
2258 Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
2259 0, nullptr, Signal.get());
2260 if (auto Err =
2261 Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
2262 return Err;
2264 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2265 return Err;
2267 if (auto Err = Signal.deinit())
2268 return Err;
2270 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2271 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2274 // Otherwise, use two-step copy with an intermediate pinned host buffer.
2275 AMDGPUMemoryManagerTy &PinnedMemoryManager =
2276 HostDevice.getPinnedMemoryManager();
2277 if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2278 return Err;
2280 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2281 return Err;
2283 return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
2284 PinnedMemoryManager);
2287 /// Exchange data between two devices within the plugin.
2288 Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
2289 void *DstPtr, int64_t Size,
2290 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2291 AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
2293 AMDGPUStreamTy *Stream = nullptr;
2294 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2295 return Err;
2296 if (Size <= 0)
2297 return Plugin::success();
2299 return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
2300 getAgent(), (uint64_t)Size);
2303 /// Initialize the async info for interoperability purposes.
2304 Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2305 // TODO: Implement this function.
2306 return Plugin::success();
2309 /// Initialize the device info for interoperability purposes.
2310 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
2311 DeviceInfo->Context = nullptr;
2313 if (!DeviceInfo->Device)
2314 DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
2316 return Plugin::success();
2319 /// Create an event.
2320 Error createEventImpl(void **EventPtrStorage) override {
2321 AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
2322 return AMDGPUEventManager.getResource(*Event);
2325 /// Destroy a previously created event.
2326 Error destroyEventImpl(void *EventPtr) override {
2327 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2328 return AMDGPUEventManager.returnResource(Event);
2331 /// Record the event.
2332 Error recordEventImpl(void *EventPtr,
2333 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2334 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2335 assert(Event && "Invalid event");
2337 AMDGPUStreamTy *Stream = nullptr;
2338 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2339 return Err;
2341 return Event->record(*Stream);
2344 /// Make the stream wait on the event.
2345 Error waitEventImpl(void *EventPtr,
2346 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2347 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2349 AMDGPUStreamTy *Stream = nullptr;
2350 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2351 return Err;
2353 return Event->wait(*Stream);
2356 /// Synchronize the current thread with the event.
2357 Error syncEventImpl(void *EventPtr) override {
2358 return Plugin::error("Synchronize event not implemented");
2361 /// Print information about the device.
2362 Error obtainInfoImpl(InfoQueueTy &Info) override {
2363 char TmpChar[1000];
2364 const char *TmpCharPtr = "Unknown";
2365 uint16_t Major, Minor;
2366 uint32_t TmpUInt, TmpUInt2;
2367 uint32_t CacheSize[4];
2368 size_t TmpSt;
2369 bool TmpBool;
2370 uint16_t WorkgrpMaxDim[3];
2371 hsa_dim3_t GridMaxDim;
2372 hsa_status_t Status, Status2;
2374 Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
2375 Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
2376 if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
2377 Info.add("HSA Runtime Version",
2378 std::to_string(Major) + "." + std::to_string(Minor));
2380 Info.add("HSA OpenMP Device Number", DeviceId);
2382 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
2383 if (Status == HSA_STATUS_SUCCESS)
2384 Info.add("Product Name", TmpChar);
2386 Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
2387 if (Status == HSA_STATUS_SUCCESS)
2388 Info.add("Device Name", TmpChar);
2390 Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
2391 if (Status == HSA_STATUS_SUCCESS)
2392 Info.add("Vendor Name", TmpChar);
2394 hsa_device_type_t DevType;
2395 Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
2396 if (Status == HSA_STATUS_SUCCESS) {
2397 switch (DevType) {
2398 case HSA_DEVICE_TYPE_CPU:
2399 TmpCharPtr = "CPU";
2400 break;
2401 case HSA_DEVICE_TYPE_GPU:
2402 TmpCharPtr = "GPU";
2403 break;
2404 case HSA_DEVICE_TYPE_DSP:
2405 TmpCharPtr = "DSP";
2406 break;
2408 Info.add("Device Type", TmpCharPtr);
2411 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
2412 if (Status == HSA_STATUS_SUCCESS)
2413 Info.add("Max Queues", TmpUInt);
2415 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
2416 if (Status == HSA_STATUS_SUCCESS)
2417 Info.add("Queue Min Size", TmpUInt);
2419 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
2420 if (Status == HSA_STATUS_SUCCESS)
2421 Info.add("Queue Max Size", TmpUInt);
2423 // FIXME: This is deprecated according to HSA documentation. But using
2424 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
2425 // runtime.
2426 Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
2427 if (Status == HSA_STATUS_SUCCESS) {
2428 Info.add("Cache");
2430 for (int I = 0; I < 4; I++)
2431 if (CacheSize[I])
2432 Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
2435 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
2436 if (Status == HSA_STATUS_SUCCESS)
2437 Info.add("Cacheline Size", TmpUInt);
2439 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
2440 if (Status == HSA_STATUS_SUCCESS)
2441 Info.add("Max Clock Freq", TmpUInt, "MHz");
2443 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
2444 if (Status == HSA_STATUS_SUCCESS)
2445 Info.add("Compute Units", TmpUInt);
2447 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
2448 if (Status == HSA_STATUS_SUCCESS)
2449 Info.add("SIMD per CU", TmpUInt);
2451 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
2452 if (Status == HSA_STATUS_SUCCESS)
2453 Info.add("Fast F16 Operation", TmpBool);
2455 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
2456 if (Status == HSA_STATUS_SUCCESS)
2457 Info.add("Wavefront Size", TmpUInt2);
2459 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
2460 if (Status == HSA_STATUS_SUCCESS)
2461 Info.add("Workgroup Max Size", TmpUInt);
2463 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
2464 if (Status == HSA_STATUS_SUCCESS) {
2465 Info.add("Workgroup Max Size per Dimension");
2466 Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
2467 Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
2468 Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
2471 Status = getDeviceAttrRaw(
2472 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
2473 if (Status == HSA_STATUS_SUCCESS) {
2474 Info.add("Max Waves Per CU", TmpUInt);
2475 Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
2478 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
2479 if (Status == HSA_STATUS_SUCCESS)
2480 Info.add("Grid Max Size", TmpUInt);
2482 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
2483 if (Status == HSA_STATUS_SUCCESS) {
2484 Info.add("Grid Max Size per Dimension");
2485 Info.add<InfoLevel2>("x", GridMaxDim.x);
2486 Info.add<InfoLevel2>("y", GridMaxDim.y);
2487 Info.add<InfoLevel2>("z", GridMaxDim.z);
2490 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
2491 if (Status == HSA_STATUS_SUCCESS)
2492 Info.add("Max fbarriers/Workgrp", TmpUInt);
2494 Info.add("Memory Pools");
2495 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2496 std::string TmpStr, TmpStr2;
2498 if (Pool->isGlobal())
2499 TmpStr = "Global";
2500 else if (Pool->isReadOnly())
2501 TmpStr = "ReadOnly";
2502 else if (Pool->isPrivate())
2503 TmpStr = "Private";
2504 else if (Pool->isGroup())
2505 TmpStr = "Group";
2506 else
2507 TmpStr = "Unknown";
2509 Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
2511 if (Pool->isGlobal()) {
2512 if (Pool->isFineGrained())
2513 TmpStr2 += "Fine Grained ";
2514 if (Pool->isCoarseGrained())
2515 TmpStr2 += "Coarse Grained ";
2516 if (Pool->supportsKernelArgs())
2517 TmpStr2 += "Kernarg ";
2519 Info.add<InfoLevel3>("Flags", TmpStr2);
2522 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
2523 if (Status == HSA_STATUS_SUCCESS)
2524 Info.add<InfoLevel3>("Size", TmpSt, "bytes");
2526 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
2527 TmpBool);
2528 if (Status == HSA_STATUS_SUCCESS)
2529 Info.add<InfoLevel3>("Allocatable", TmpBool);
2531 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
2532 TmpSt);
2533 if (Status == HSA_STATUS_SUCCESS)
2534 Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
2536 Status = Pool->getAttrRaw(
2537 HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
2538 if (Status == HSA_STATUS_SUCCESS)
2539 Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
2541 Status =
2542 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
2543 if (Status == HSA_STATUS_SUCCESS)
2544 Info.add<InfoLevel3>("Accessable by all", TmpBool);
2547 Info.add("ISAs");
2548 auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
2549 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
2550 if (Status == HSA_STATUS_SUCCESS)
2551 Info.add<InfoLevel2>("Name", TmpChar);
2553 return Status;
2556 // Silently consume the error.
2557 if (Err)
2558 consumeError(std::move(Err));
2560 return Plugin::success();
2563 /// Getters and setters for stack and heap sizes.
2564 Error getDeviceStackSize(uint64_t &Value) override {
2565 Value = 0;
2566 return Plugin::success();
2568 Error setDeviceStackSize(uint64_t Value) override {
2569 return Plugin::success();
2571 Error getDeviceHeapSize(uint64_t &Value) override {
2572 Value = DeviceMemoryPoolSize;
2573 return Plugin::success();
2575 Error setDeviceHeapSize(uint64_t Value) override {
2576 for (DeviceImageTy *Image : LoadedImages)
2577 if (auto Err = setupDeviceMemoryPool(Plugin::get(), *Image, Value))
2578 return Err;
2579 DeviceMemoryPoolSize = Value;
2580 return Plugin::success();
2582 Error getDeviceMemorySize(uint64_t &Value) override {
2583 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2584 if (Pool->isGlobal()) {
2585 hsa_status_t Status =
2586 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
2587 return Plugin::check(Status, "Error in getting device memory size: %s");
2590 return Plugin::error("getDeviceMemorySize:: no global pool");
2593 /// AMDGPU-specific function to get device attributes.
2594 template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
2595 hsa_status_t Status =
2596 hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2597 return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
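/// Raw variant of getDeviceAttr that returns the HSA status code instead of
/// wrapping it in an Error.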
2600 template <typename Ty>
2601 hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
2602 return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2605 /// Get the device agent.
2606 hsa_agent_t getAgent() const override { return Agent; }
2608 /// Get the signal manager.
2609 AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }
2611 /// Retrieve and construct all memory pools of the device agent.
2612 Error retrieveAllMemoryPools() override {
2613 // Iterate through the available pools of the device agent.
2614 return utils::iterateAgentMemoryPools(
2615 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
2616 AMDGPUMemoryPoolTy *MemoryPool =
2617 Plugin::get().allocate<AMDGPUMemoryPoolTy>();
2618 new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
2619 AllMemoryPools.push_back(MemoryPool);
2620 return HSA_STATUS_SUCCESS;
2624 private:
2625 using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
2626 using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
2628 /// Envar for controlling the number of HSA queues per device. A high number
2629 /// of queues may degrade performance.
2630 UInt32Envar OMPX_NumQueues;
2632 /// Envar for controlling the size of each HSA queue. The size is the number
2633 /// of HSA packets a queue is expected to hold. It is also the number of HSA
2634 /// packets that can be pushed into each queue without waiting for the driver to
2635 /// process them.
2636 UInt32Envar OMPX_QueueSize;
2638 /// Envar for controlling the default number of teams relative to the number
2639 /// of compute units (CUs) the device has:
2640 /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
2641 UInt32Envar OMPX_DefaultTeamsPerCU;
2643 /// Envar specifying the maximum size in bytes where the memory copies are
2644 /// asynchronous operations. Up to this transfer size, the memory copies are
2645 /// asynchronous operations pushed to the corresponding stream. For larger
2646 /// transfers, they are synchronous transfers.
2647 UInt32Envar OMPX_MaxAsyncCopyBytes;
2649 /// Envar controlling the initial number of HSA signals per device. There is
2650 /// one manager of signals per device managing several pre-allocated signals.
2651 /// These signals are mainly used by AMDGPU streams. If needed, more signals
2652 /// will be created.
2653 UInt32Envar OMPX_InitialNumSignals;
2655 /// Envar to set the time to wait in active state before switching to the
2656 /// blocked state. The default of 2000000 busy-waits for 2 seconds before
2657 /// going into a blocking HSA wait state. The unit for this variable is
2658 /// microseconds.
2659 UInt32Envar OMPX_StreamBusyWait;
2661 /// Stream manager for AMDGPU streams.
2662 AMDGPUStreamManagerTy AMDGPUStreamManager;
2664 /// Event manager for AMDGPU events.
2665 AMDGPUEventManagerTy AMDGPUEventManager;
2667 /// Signal manager for AMDGPU signals.
2668 AMDGPUSignalManagerTy AMDGPUSignalManager;
2670 /// The agent handler corresponding to the device.
2671 hsa_agent_t Agent;
2673 /// The GPU architecture.
2674 std::string ComputeUnitKind;
2676 /// The frequency of the steady clock inside the device.
2677 uint64_t ClockFrequency;
2679 /// The total number of concurrent work items that can be running on the GPU.
2680 uint64_t HardwareParallelism;
2682 /// Reference to the host device.
2683 AMDHostDeviceTy &HostDevice;
2685 /// The current size of the global device memory pool (managed by us).
2686 uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
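/// Load the image's code object into an HSA executable, freeze and validate
/// it, and read the kernel metadata from the ELF image.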
2689 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
2690 hsa_status_t Status;
2691 Status = hsa_code_object_deserialize(getStart(), getSize(), "", &CodeObject);
2692 if (auto Err =
2693 Plugin::check(Status, "Error in hsa_code_object_deserialize: %s"))
2694 return Err;
2696 Status = hsa_executable_create_alt(
2697 HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
2698 if (auto Err =
2699 Plugin::check(Status, "Error in hsa_executable_create_alt: %s"))
2700 return Err;
2702 Status = hsa_executable_load_code_object(Executable, Device.getAgent(),
2703 CodeObject, "");
2704 if (auto Err =
2705 Plugin::check(Status, "Error in hsa_executable_load_code_object: %s"))
2706 return Err;
2708 Status = hsa_executable_freeze(Executable, "");
2709 if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s"))
2710 return Err;
2712 uint32_t Result;
2713 Status = hsa_executable_validate(Executable, &Result);
2714 if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s"))
2715 return Err;
2717 if (Result)
2718 return Plugin::error("Loaded HSA executable does not validate");
2720 if (auto Err = utils::readAMDGPUMetaDataFromImage(
2721 getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
2722 return Err;
2724 return Plugin::success();
2727 Expected<hsa_executable_symbol_t>
2728 AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
2729 StringRef SymbolName) const {
2731 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
2732 hsa_agent_t Agent = AMDGPUDevice.getAgent();
2734 hsa_executable_symbol_t Symbol;
2735 hsa_status_t Status = hsa_executable_get_symbol_by_name(
2736 Executable, SymbolName.data(), &Agent, &Symbol);
2737 if (auto Err = Plugin::check(
2738 Status, "Error in hsa_executable_get_symbol_by_name(%s): %s",
2739 SymbolName.data()))
2740 return std::move(Err);
2742 return Symbol;
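/// Create and initialize the underlying resource for the given AMDGPU device.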
2745 template <typename ResourceTy>
2746 Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
2747 if (Resource)
2748 return Plugin::error("Creating an existing resource");
2750 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
2752 Resource = new ResourceTy(AMDGPUDevice);
2754 return Resource->init();
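/// Construct a stream bound to the device's agent and signal manager. The
/// backing HSA queue is assigned later by the stream manager.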
2757 AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
2758 : Agent(Device.getAgent()), Queue(nullptr),
2759 SignalManager(Device.getSignalManager()), Device(Device),
2760 // Initialize the std::deque with some empty positions.
2761 Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
2762 StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
2764 /// Class implementing the AMDGPU-specific functionalities of the global
2765 /// handler.
2766 struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
2767 /// Get the metadata of a global from the device. The name and size of the
2768 /// global are read from DeviceGlobal and the address of the global is written
2769 /// to DeviceGlobal.
2770 Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
2771 DeviceImageTy &Image,
2772 GlobalTy &DeviceGlobal) override {
2773 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
2775 // Find the symbol on the device executable.
2776 auto SymbolOrErr =
2777 AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
2778 if (!SymbolOrErr)
2779 return SymbolOrErr.takeError();
2781 hsa_executable_symbol_t Symbol = *SymbolOrErr;
2782 hsa_symbol_kind_t SymbolType;
2783 hsa_status_t Status;
2784 uint64_t SymbolAddr;
2785 uint32_t SymbolSize;
2787 // Retrieve the type, address and size of the symbol.
2788 std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
2789 {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
2790 {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
2791 {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};
2793 for (auto &Info : RequiredInfos) {
2794 Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
2795 if (auto Err = Plugin::check(
2796 Status, "Error in hsa_executable_symbol_get_info: %s"))
2797 return Err;
2800 // Check the size of the symbol.
2801 if (SymbolSize != DeviceGlobal.getSize())
2802 return Plugin::error(
2803 "Failed to load global '%s' due to size mismatch (%zu != %zu)",
2804 DeviceGlobal.getName().data(), SymbolSize,
2805 (size_t)DeviceGlobal.getSize());
2807 // Store the symbol address on the device global metadata.
2808 DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));
2810 return Plugin::success();
2814 /// Class implementing the AMDGPU-specific functionalities of the plugin.
2815 struct AMDGPUPluginTy final : public GenericPluginTy {
2816 /// Create an AMDGPU plugin and initialize the AMDGPU driver.
2817 AMDGPUPluginTy()
2818 : GenericPluginTy(getTripleArch()), Initialized(false),
2819 HostDevice(nullptr) {}
2821 /// This class should not be copied.
2822 AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
2823 AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;
2825 /// Initialize the plugin and return the number of devices.
2826 Expected<int32_t> initImpl() override {
2827 hsa_status_t Status = hsa_init();
2828 if (Status != HSA_STATUS_SUCCESS) {
2829 // hsa_init failed, so we cannot call hsa_status_string to describe the error.
2830 DP("Failed to initialize AMDGPU's HSA library\n");
2831 return 0;
2834 // The initialization of HSA was successful. It should be safe to call
2835 // HSA functions from now on, e.g., hsa_shut_down.
2836 Initialized = true;
2838 #ifdef OMPT_SUPPORT
2839 ompt::connectLibrary();
2840 #endif
2842 // Register event handler to detect memory errors on the devices.
2843 Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
2844 if (auto Err = Plugin::check(
2845 Status, "Error in hsa_amd_register_system_event_handler: %s"))
2846 return std::move(Err);
2848 // List of host (CPU) agents.
2849 llvm::SmallVector<hsa_agent_t> HostAgents;
2851 // Count the number of available agents.
2852 auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
2853 // Get the device type of the agent.
2854 hsa_device_type_t DeviceType;
2855 hsa_status_t Status =
2856 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
2857 if (Status != HSA_STATUS_SUCCESS)
2858 return Status;
2860 // Classify the agents into kernel (GPU) and host (CPU) agents.
2861 if (DeviceType == HSA_DEVICE_TYPE_GPU) {
2862 // Ensure that the GPU agent supports kernel dispatch packets.
2863 hsa_agent_feature_t Features;
2864 Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
2865 if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
2866 KernelAgents.push_back(Agent);
2867 } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
2868 HostAgents.push_back(Agent);
2870 return HSA_STATUS_SUCCESS;
2873 if (Err)
2874 return std::move(Err);
2876 int32_t NumDevices = KernelAgents.size();
2877 if (NumDevices == 0) {
2878 // Do not initialize if there are no devices.
2879 DP("There are no devices supporting AMDGPU.\n");
2880 return 0;
2883 // There are kernel agents but there is no host agent. That should be
2884 // treated as an error.
2885 if (HostAgents.empty())
2886 return Plugin::error("No AMDGPU host agents");
2888 // Initialize the host device using host agents.
2889 HostDevice = allocate<AMDHostDeviceTy>();
2890 new (HostDevice) AMDHostDeviceTy(HostAgents);
2892 // Set up the memory pools available to the host.
2893 if (auto Err = HostDevice->init())
2894 return std::move(Err);
2896 return NumDevices;
2899 /// Deinitialize the plugin.
2900 Error deinitImpl() override {
2901 // The HSA runtime was not initialized, so nothing from the plugin was
2902 // actually initialized.
2903 if (!Initialized)
2904 return Plugin::success();
2906 if (HostDevice)
2907 if (auto Err = HostDevice->deinit())
2908 return Err;
2910 // Finalize the HSA runtime.
2911 hsa_status_t Status = hsa_shut_down();
2912 return Plugin::check(Status, "Error in hsa_shut_down: %s");
2915 Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }
2917 /// Get the ELF code for recognizing the compatible image binary.
2918 uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }
2920 /// Check whether the image is compatible with an AMDGPU device.
2921 Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
2922 for (hsa_agent_t Agent : KernelAgents) {
2923 std::string Target;
2924 auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
2925 uint32_t Length;
2926 hsa_status_t Status;
2927 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
2928 if (Status != HSA_STATUS_SUCCESS)
2929 return Status;
2931 llvm::SmallVector<char> ISAName(Length);
2932 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
2933 if (Status != HSA_STATUS_SUCCESS)
2934 return Status;
2936 llvm::StringRef TripleTarget(ISAName.begin(), Length);
2937 if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
2938 Target = TripleTarget.ltrim('-').rtrim('\0').str();
2939 return HSA_STATUS_SUCCESS;
2941 if (Err)
2942 return std::move(Err);
2944 if (!utils::isImageCompatibleWithEnv(Info, Target))
2945 return false;
2947 return true;
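/// Indicate that direct device-to-device data exchange is supported between
/// any pair of AMDGPU devices handled by this plugin.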
2950 bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
2951 return true;
2954 /// Get the host device instance.
2955 AMDHostDeviceTy &getHostDevice() {
2956 assert(HostDevice && "Host device not initialized");
2957 return *HostDevice;
2960 /// Get the kernel agent with the corresponding agent id.
2961 hsa_agent_t getKernelAgent(int32_t AgentId) const {
2962 assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
2963 return KernelAgents[AgentId];
2966 /// Get the list of the available kernel agents.
2967 const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
2968 return KernelAgents;
2971 private:
2972 /// Event handler that will be called by ROCr if an event is detected.
2973 static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
2974 if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
2975 return HSA_STATUS_SUCCESS;
2977 SmallVector<std::string> Reasons;
2978 uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
2979 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
2980 Reasons.emplace_back("Page not present or supervisor privilege");
2981 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
2982 Reasons.emplace_back("Write access to a read-only page");
2983 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
2984 Reasons.emplace_back("Execute access to a page marked NX");
2985 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
2986 Reasons.emplace_back("GPU attempted access to a host only page");
2987 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
2988 Reasons.emplace_back("DRAM ECC failure");
2989 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
2990 Reasons.emplace_back("Can't determine the exact fault address");
2991 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
2992 Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)");
2993 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
2994 Reasons.emplace_back("GPU reset following unspecified hang");
2996 // If we do not know the reason, report it as unknown.
2998 if (Reasons.empty())
2999 Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");
3001 uint32_t Node = -1;
3002 hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
3004 // Abort the execution since we do not recover from this error.
3005 FATAL_MESSAGE(1,
3006 "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
3007 ") at virtual address %p. Reasons: %s",
3008 Node, Event->memory_fault.agent.handle,
3009 (void *)Event->memory_fault.virtual_address,
3010 llvm::join(Reasons, ", ").c_str());
3012 return HSA_STATUS_ERROR;
3015 /// Indicate whether the HSA runtime was correctly initialized. Even if there
3016 /// are no available devices this boolean will be true. It indicates whether
3017 /// we can safely call HSA functions (e.g., hsa_shut_down).
3018 bool Initialized;
3020 /// Arrays of the available GPU and CPU agents. These arrays of handles should
3021 /// not be here but in the AMDGPUDeviceTy structures directly. However, the
3022 /// HSA standard does not provide API functions to retrieve agents directly,
3023 /// only iterating functions. We cache the agents here for convenience.
3024 llvm::SmallVector<hsa_agent_t> KernelAgents;
3026 /// The device representing all HSA host agents.
3027 AMDHostDeviceTy *HostDevice;
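/// Launch a kernel: allocate the kernel argument buffer from the host args
/// memory pool, copy the explicit arguments and initialize the implicit ones,
/// then push the dispatch onto a stream.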
3030 Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3031 uint32_t NumThreads, uint64_t NumBlocks,
3032 KernelArgsTy &KernelArgs, void *Args,
3033 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3034 const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
3036 if (ArgsSize < KernelArgsSize)
3037 return Plugin::error("Mismatch of kernel arguments size");
3039 // The args size reported by HSA may or may not contain the implicit args.
3040 // For now, assume that HSA does not consider the implicit arguments when
3041 // reporting the arguments of a kernel. In the worst case, we can waste
3042 // 56 bytes per allocation.
3043 uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize;
3045 AMDHostDeviceTy &HostDevice = Plugin::get<AMDGPUPluginTy>().getHostDevice();
3046 AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();
3048 void *AllArgs = nullptr;
3049 if (auto Err = ArgsMemoryManager.allocate(AllArgsSize, &AllArgs))
3050 return Err;
3052 // Account for user requested dynamic shared memory.
3053 uint32_t GroupSize = getGroupSize();
3054 if (uint32_t MaxDynCGroupMem = std::max(
3055 KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
3056 GroupSize += MaxDynCGroupMem;
3059 // Initialize implicit arguments.
3060 utils::AMDGPUImplicitArgsTy *ImplArgs =
3061 reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
3062 advanceVoidPtr(AllArgs, KernelArgsSize));
3064 // Initialize the implicit arguments to zero.
3065 std::memset(ImplArgs, 0, ImplicitArgsSize);
3067 // Copy the explicit arguments.
3068 // TODO: We should expose the args memory manager alloc to the common part as
3069 // alternative to copying them twice.
3070 if (KernelArgs.NumArgs)
3071 std::memcpy(AllArgs, *static_cast<void **>(Args),
3072 sizeof(void *) * KernelArgs.NumArgs);
3074 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
3076 AMDGPUStreamTy *Stream = nullptr;
3077 if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
3078 return Err;
3080 // If this kernel requires an RPC server we attach its pointer to the stream.
3081 if (GenericDevice.getRPCServer())
3082 Stream->setRPCServer(GenericDevice.getRPCServer());
3084 // Only COV5 implicit args need to be set; COV4 implicit args are not used.
3085 if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
3086 ImplArgs->BlockCountX = NumBlocks;
3087 ImplArgs->GroupSizeX = NumThreads;
3088 ImplArgs->GroupSizeY = 1;
3089 ImplArgs->GroupSizeZ = 1;
3090 ImplArgs->GridDims = 1;
3093 // Push the kernel launch into the stream.
3094 return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
3095 GroupSize, ArgsMemoryManager);
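/// Print a summary of the kernel launch (argument count, teams x threads, LDS
/// usage, and register counts) when plugin kernel info output is requested.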
3098 Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
3099 KernelArgsTy &KernelArgs,
3100 uint32_t NumThreads,
3101 uint64_t NumBlocks) const {
3102 // Only do all this when the output is requested
3103 if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
3104 return Plugin::success();
3106 // We don't have data to print additional info, but this is not a hard error.
3107 if (!KernelInfo.has_value())
3108 return Plugin::success();
3110 // General Info
3111 auto NumGroups = NumBlocks;
3112 auto ThreadsPerGroup = NumThreads;
3114 // Kernel Arguments Info
3115 auto ArgNum = KernelArgs.NumArgs;
3116 auto LoopTripCount = KernelArgs.Tripcount;
3118 // Details for AMDGPU kernels (read from image)
3119 // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
3120 auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
3121 auto SGPRCount = (*KernelInfo).SGPRCount;
3122 auto VGPRCount = (*KernelInfo).VGPRCount;
3123 auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
3124 auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
3125 auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
3127   // Prints additional launch info that contains the following:
3128   //   Num Args: the number of kernel arguments
3129   //   Teams x Thrds: the number of teams and the number of threads
3130   //     actually running
3131   //   MaxFlatWorkgroupSize: the maximum flat work-group size supported by
3132   //     the kernel, in work-items
3133   //   LDS Usage: the number of bytes used in LDS storage
3134   //   S/VGPR Count: the number of SGPRs/VGPRs occupied by the kernel
3135   //   S/VGPR Spill Count: the number of SGPRs/VGPRs spilled by the kernel
3136   //   Tripcount: the loop trip count for the kernel
3137 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
3138 "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS "
3139 "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
3140 "%lu\n",
3141 ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
3142 GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
3143 LoopTripCount);
3145   return Plugin::success();
3146 }
3148 GenericPluginTy *Plugin::createPlugin() { return new AMDGPUPluginTy(); }
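3149 // Create an AMDGPU device for the given device identifier.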
3150 GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
3151 AMDGPUPluginTy &Plugin = get<AMDGPUPluginTy &>();
3152 return new AMDGPUDeviceTy(DeviceId, NumDevices, Plugin.getHostDevice(),
3153                             Plugin.getKernelAgent(DeviceId));
3154 }
3156 GenericGlobalHandlerTy *Plugin::createGlobalHandler() {
3157   return new AMDGPUGlobalHandlerTy();
3158 }
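3159 // Convert an HSA status code into an error with a readable description.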
3160 template <typename... ArgsTy>
3161 Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
3162 hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
3163 if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
3164 return Error::success();
3166 const char *Desc = "Unknown error";
3167 hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
3168 if (Ret != HSA_STATUS_SUCCESS)
3169 REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
3171 return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
3172                                                    ErrFmt, Args..., Desc);
3173 }
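3174 // Allocate memory from the manager's pool and expose it to all kernel agents.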
3175 void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
3176 TargetAllocTy Kind) {
3177 // Allocate memory from the pool.
3178 void *Ptr = nullptr;
3179 if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
3180 consumeError(std::move(Err));
3181     return nullptr;
3182   }
3183 assert(Ptr && "Invalid pointer");
3185 auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents();
3187 // Allow all kernel agents to access the allocation.
3188 if (auto Err = MemoryPool->enableAccess(Ptr, Size, KernelAgents)) {
3189 REPORT("%s\n", toString(std::move(Err)).data());
3190     return nullptr;
3191   }
3192   return Ptr;
3193 }
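3194 // Allocate device, host, or shared memory depending on the allocation kind.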
3195 void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
3196 if (Size == 0)
3197 return nullptr;
3199 // Find the correct memory pool.
3200 AMDGPUMemoryPoolTy *MemoryPool = nullptr;
3201 switch (Kind) {
3202 case TARGET_ALLOC_DEFAULT:
3203 case TARGET_ALLOC_DEVICE:
3204 MemoryPool = CoarseGrainedMemoryPools[0];
3205 break;
3206 case TARGET_ALLOC_HOST:
3207 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
3208 break;
3209 case TARGET_ALLOC_SHARED:
3210 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
3211     break;
3212   }
3214 if (!MemoryPool) {
3215 REPORT("No memory pool for the specified allocation kind\n");
3216     return nullptr;
3217   }
3219 // Allocate from the corresponding memory pool.
3220 void *Alloc = nullptr;
3221 if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
3222 REPORT("%s\n", toString(std::move(Err)).data());
3223     return nullptr;
3224   }
3226 if (Alloc) {
3227 auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents();
3228     // Agent access is inherently necessary for host or shared allocations.
3229     // It is also enabled for device memory to allow device-to-device memcpy.
3231 // Enable all kernel agents to access the buffer.
3232 if (auto Err = MemoryPool->enableAccess(Alloc, Size, KernelAgents)) {
3233 REPORT("%s\n", toString(std::move(Err)).data());
3234       return nullptr;
3235     }
3236   }
3238   return Alloc;
3239 }
3241 } // namespace plugin
3242 } // namespace target
3243 } // namespace omp
3244 } // namespace llvm