//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "DeviceEnvironment.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG
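
// Typical use of the helpers above (illustrative sketch only, mirroring the
// error-handling pattern used throughout this file): call a driver API, then
// report and decode the error on failure.
//
//   CUresult Err = cuCtxSetCurrent(Context);
//   if (Err != CUDA_SUCCESS) {
//     REPORT("Error returned from cuCtxSetCurrent\n");
//     CUDA_ERR_STRING(Err);
//     return OFFLOAD_FAIL;
//   }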
#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel.
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventRecord(Event, Stream);
  if (Err != CUDA_SUCCESS) {
    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
       DPxPTR(Stream), DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int syncEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventSynchronize(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure containing per-device data.
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Resource allocator where \p T is the resource type.
/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
/// accordingly. The implementation should not raise any exception.
template <typename T> class AllocatorTy {
public:
  /// Create a resource and assign it to R.
  int create(T &R) noexcept;
  /// Destroy the resource.
  int destroy(T) noexcept;
};

/// Allocator for CUstream.
template <> class AllocatorTy<CUstream> {
  CUcontext Context;

public:
  AllocatorTy(CUcontext C) noexcept : Context(C) {}

  /// See AllocatorTy<T>::create.
  int create(CUstream &Stream) noexcept {
    if (!checkResult(cuCtxSetCurrent(Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
                     "Error returned from cuStreamCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUstream Stream) noexcept {
    if (!checkResult(cuCtxSetCurrent(Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;
    if (!checkResult(cuStreamDestroy(Stream),
                     "Error returned from cuStreamDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// Allocator for CUevent.
template <> class AllocatorTy<CUevent> {
public:
  /// See AllocatorTy<T>::create.
  int create(CUevent &Event) noexcept {
    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
                     "Error returned from cuEventCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUevent Event) noexcept {
    if (!checkResult(cuEventDestroy(Event),
                     "Error returned from cuEventDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// A generic pool of resources where \p T is the resource type.
/// \p T should be copyable as the object is stored in \p std::vector .
template <typename T> class ResourcePoolTy {
  /// Index of the next available resource.
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources.
  std::vector<T> Resources;
  /// A reference to the corresponding allocator.
  AllocatorTy<T> Allocator;

  /// If `Resources` is used up, we will fill in more resources. It assumes
  /// that the new size `Size` is always larger than the current size.
  bool resize(size_t Size) {
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      T NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Resources.push_back(NewItem);
    }
    return true;
  }

public:
  ResourcePoolTy(AllocatorTy<T> &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }

  /// Get a resource from the pool. `Next` always points to the next available
  /// resource. That means `[0, Next-1]` have been assigned, and `[Next, ...)`
  /// are still available. If there is no resource left, we will ask for more.
  /// Each time a resource is assigned, `Next` increases by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After assignment, the pool becomes the following and s is assigned.
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
  int acquire(T &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }

  /// Return the resource back to the pool. When we return a resource, we need
  /// to first decrease `Next`, and then copy the resource back. It is worth
  /// noting that the order in which resources are returned might differ from
  /// the order in which they were assigned; that is to say, at some point
  /// there might be two identical resources.
  /// xxax+a+++++
  ///     ^
  ///     Next
  /// However, it doesn't matter, because they're always on the two sides of
  /// `Next`. The left one will in the end be overwritten by another resource.
  /// Therefore, after several executions, the order of the pool might differ
  /// from its initial state.
  void release(T R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }

  /// Release all stored resources and clear the pool.
  /// Note: This function is not thread safe. Be sure to guard it if necessary.
  void clear() noexcept {
    for (auto &R : Resources)
      (void)Allocator.destroy(R);
    Resources.clear();
  }
};

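// How the pool is used below (illustrative sketch; see getStream() and
// synchronize()): a stream is acquired lazily the first time an AsyncInfo
// needs a queue and is returned to the pool once the queue is synchronized.
//
//   CUstream S;
//   if (StreamPool[DeviceId]->acquire(S) == OFFLOAD_SUCCESS) {
//     // ... enqueue async copies / kernel launches on S ...
//     cuStreamSynchronize(S);
//     StreamPool[DeviceId]->release(S);
//   }
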
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;
  // Number of initial streams for each device.
  int NumInitialStreams = 32;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  using StreamPoolTy = ResourcePoolTy<CUstream>;
  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;

  ResourcePoolTy<CUevent> EventPool;

  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with the device native runtime
  /// library to allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

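  // Summary of the allocator above (descriptive note): each TargetAllocTy kind
  // maps onto a different CUDA driver allocation:
  //   TARGET_ALLOC_DEFAULT / TARGET_ALLOC_DEVICE -> cuMemAlloc / cuMemFree
  //   TARGET_ALLOC_HOST                          -> cuMemAllocHost / cuMemFreeHost
  //   TARGET_ALLOC_SHARED                        -> cuMemAllocManaged / cuMemFree
  // Host-pinned pointers are remembered in HostPinnedAllocs so free() can pick
  // the matching deallocation routine.
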
  /// A vector of device allocators.
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager.
  bool UseMemoryManager = true;

  // Record an entry point associated with the device.
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer.
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table.
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer.
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear the entries table for a device.
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue) {
      CUstream S;
      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
        return nullptr;

      AsyncInfo->Queue = S;
    }

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

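  // Note on getStream() above (descriptive note): the CUstream is cached in
  // AsyncInfo->Queue, so all operations issued with the same __tgt_async_info
  // are serialized on one stream; synchronize() later returns that stream to
  // the pool and clears the queue pointer.
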
public:
  // This class should not be copied.
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
        DynamicMemorySize(0), EventPool(AllocatorTy<CUevent>()) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed.
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);
    StreamPool.resize(NumberOfDevices);

    // Get environment variables regarding teams.
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
         DynamicMemorySize);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
      // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
      NumInitialStreams = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
    }

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from the environment variable.
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

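  // Environment variables read by the constructor above (descriptive note):
  //   OMP_TEAM_LIMIT, OMP_TEAMS_THREAD_LIMIT, OMP_NUM_TEAMS,
  //   LIBOMPTARGET_SHARED_MEMORY_SIZE, LIBOMPTARGET_NUM_INITIAL_STREAMS,
  // plus the memory-manager threshold queried via
  // MemoryManagerTy::getSizeThresholdFromEnv(). A possible invocation
  // (values are only illustrative, not defaults):
  //   OMP_NUM_TEAMS=256 LIBOMPTARGET_SHARED_MEMORY_SIZE=4096 ./a.out
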
  ~DeviceRTLTy() {
    // Release the memory managers first in case their dependent data would
    // otherwise be destroyed before them.
    for (auto &M : MemoryManagers)
      M.release();

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (auto &S : StreamPool)
      S.reset();

    EventPool.clear();

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid.
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive.
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per-device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize the stream pool.
    if (!StreamPool[DeviceId])
      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(
          AllocatorTy<CUstream>(DeviceData[DeviceId].Context),
          NumInitialStreams);

    // Query attributes to determine the number of threads/block and
    // blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set the warp size.
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables.
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set the default number of teams.
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set the default number of threads.
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

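  // Limits established by initDevice() above (descriptive note): BlocksPerGrid
  // is the device's MAX_GRID_DIM_X capped at HardTeamLimit (64k) and then at
  // OMP_TEAM_LIMIT; ThreadsPerBlock is MAX_BLOCK_DIM_X capped at
  // OMP_TEAMS_THREAD_LIMIT and then at HardThreadLimit (1024). NumTeams and
  // NumThreads are the launch defaults and never exceed those limits.
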
  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using.
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address to uniquely identify the
        // target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name.
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' Failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name.
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' Failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default value GENERIC (in case the symbol is missing from the cubin
      // file).
      llvm::omp::OMPTgtExecModeFlags ExecModeVal;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send device environment data to the device.
    {
      // TODO: The device ID used here is not the real device ID used by OpenMP.
      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
                                    static_cast<uint32_t>(DeviceId),
                                    static_cast<uint32_t>(DynamicMemorySize)};

      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.DebugKind = std::stoi(EnvStr);

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

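  // Convention relied upon by loadBinary() above (descriptive note): for every
  // kernel entry `foo` the device image may carry a global named
  // `foo_exec_mode` holding an OMPTgtExecModeFlags value; if the symbol is
  // absent the kernel is treated as Generic. Global variables are matched by
  // name and size, and with unified shared memory their device copies are
  // overwritten with the host address.
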
  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are two different devices, we try a peer-to-peer copy first.
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

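  // Fallback chain in dataExchange() above (descriptive note):
  //   cuDeviceCanAccessPeer -> cuCtxEnablePeerAccess -> cuMemcpyPeerAsync,
  // and on any failure (or for same-device copies) it degrades to the plain
  // device-to-device memcpyDtoD() path on the source device's stream.
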
  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary.
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and will create far too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams that each thread
          // will execute one iteration of the loop. Round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for` and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n",
                 static_cast<int8_t>(KernelInfo->ExecutionMode));
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

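  // Worked example for the block computation above (illustrative numbers
  // only): in SPMD mode with LoopTripCount = 10000 and CudaThreadsPerBlock =
  // 128, CudaBlocksPerGrid = ((10000 - 1) / 128) + 1 = 79, so every thread
  // gets one loop iteration; in Generic mode the same trip count would yield
  // 10000 blocks, one team per `distribute` iteration.
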
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }

  void printDeviceInfo(int32_t device_id) {
    char TmpChar[1000];
    std::string TmpStr;
    size_t TmpSt;
    int TmpInt, TmpInt2, TmpInt3;

    CUdevice Device;
    checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");

    cuDriverGetVersion(&TmpInt);
    printf(" CUDA Driver Version: \t\t%d \n", TmpInt);
    printf(" CUDA Device Number: \t\t%d \n", device_id);
    checkResult(cuDeviceGetName(TmpChar, 1000, Device),
                "Error returned from cuDeviceGetName\n");
    printf(" Device Name: \t\t\t%s \n", TmpChar);
    checkResult(cuDeviceTotalMem(&TmpSt, Device),
                "Error returned from cuDeviceTotalMem\n");
    printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Number of Multiprocessors: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Registers per Block: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Threads per Block: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

  int createEvent(void **P) {
    CUevent Event = nullptr;
    if (EventPool.acquire(Event) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
    *P = Event;
    return OFFLOAD_SUCCESS;
  }

  int destroyEvent(void *EventPtr) {
    EventPool.release(reinterpret_cast<CUevent>(EventPtr));
    return OFFLOAD_SUCCESS;
  }

  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version, and is defined as 0x0. In previous versions, per
    // the CUDA API documentation, that argument has to be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

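// Note on the synchronous entry points in this file: each one (data_submit,
// data_retrieve, data_exchange, run_target_*_region) simply creates a local
// __tgt_async_info, forwards to its *_async counterpart, and then calls
// __tgt_rtl_synchronize, which also returns the underlying stream to the pool.
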
int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");
  return DeviceRTL.createEvent(event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.destroyEvent(event_ptr);
}

#ifdef __cplusplus
}
#endif